/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

void *high_memory;
EXPORT_SYMBOL(high_memory);

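/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */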
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

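/*
 * With SPLIT_RSS_COUNTING, RSS counter updates are first accumulated in
 * current->rss_stat and only folded back into the mm-wide counters once
 * enough events have passed (TASK_RSS_EVENTS_THRESH below), avoiding
 * atomic updates on every single fault.
 */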
#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif

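/*
 * Generic mmu_gather: pages unmapped from the page tables are queued up
 * in batches and only handed back to the page allocator after the TLB
 * has been flushed, so no CPU can still hold a stale translation to a
 * page that has already been freed.
 */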
#ifdef HAVE_GENERIC_MMU_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr = 0;
	batch->max = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	tlb->fullmm = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr = 0;
	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	tlb->active = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
	tlb->page_size = 0;

	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

void arch_tlb_finish_mmu(struct mmu_gather *tlb,
		unsigned long start, unsigned long end, bool force)
{
	struct mmu_gather_batch *batch, *next;

	if (force)
		__tlb_adjust_range(tlb, start, end - start);

	tlb_flush_mmu(tlb);

	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);
	VM_WARN_ON(tlb->page_size != page_size);

	batch = tlb->active;

	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return false;
}

#endif

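/*
 * With CONFIG_HAVE_RCU_TABLE_FREE, page-table pages are freed after an
 * RCU grace period (or, when the batch allocation fails, after an IPI
 * broadcast): lockless walkers such as fast GUP traverse page tables
 * with interrupts disabled, so a table must not be reused until every
 * such walk is guaranteed to have finished.
 */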
#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
	tlb_flush_mmu_tlbonly(tlb);
#endif
}

static void tlb_remove_table_smp_sync(void *arg)
{
}

static void tlb_remove_table_one(void *table)
{
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		tlb_table_invalidate(tlb);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif

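/*
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table
 * tear-down of the (start, end) range of @mm.  tlb_finish_mmu() then
 * performs the final TLB flush and frees the batched pages; if another
 * thread was flushing this mm concurrently (mm_tlb_flush_nested()), the
 * whole range is force-flushed to be safe.
 */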
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			unsigned long start, unsigned long end)
{
	arch_tlb_gather_mmu(tlb, mm, start, end);
	inc_tlb_flush_pending(tlb->mm);
}

void tlb_finish_mmu(struct mmu_gather *tlb,
		unsigned long start, unsigned long end)
{
	bool force = mm_tlb_flush_nested(tlb->mm);

	arch_tlb_finish_mmu(tlb, start, end, force);
	dec_tlb_flush_pending(tlb->mm);
}

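/*
 * Note: this doesn't free the actual pages themselves. That
 * is done by the caller.  These routines only tear down and free
 * the page-table pages mapping the range.
 */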
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

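/*
 * This function frees user-level page tables of a process.
 *
 * The range is constrained by "floor" and "ceiling": a page-table page
 * is only freed when the addresses it can map lie entirely inside
 * [floor, ceiling), so tables shared with a neighbouring region are
 * preserved.
 */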
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

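	/*
	 * Ensure all pte setup (e.g. pte page lock and page clearing) is
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables: the smp_wmb() below pairs with the
	 * data-dependent loads of a concurrent page-table walk.
	 * (__pte_alloc_kernel() relies on the same barrier.)
	 */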
	smp_wmb();

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb();

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

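/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */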
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

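/*
 * vm_normal_page -- this function gets the "struct page" associated
 * with a pte.
 *
 * "Normal" mappings have a struct page that corresponds to the pte and
 * participates in rmap, COW and reference counting.  "Special" mappings
 * (pte_special(), or the raw-pfn cases of VM_PFNMAP/VM_MIXEDMAP on
 * architectures without CONFIG_ARCH_HAS_PTE_SPECIAL) do not, and NULL
 * is returned for them.  The zero page and device pages are treated as
 * special here as well.
 */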
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte, bool with_public_device)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;

		if (likely(pfn <= highest_memmap_pfn)) {
			struct page *page = pfn_to_page(pfn);

			if (is_device_public_page(page)) {
				if (with_public_device)
					return page;
				return NULL;
			}
		}

		if (pte_devmap(pte))
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

out:
	return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_zero_pfn(pfn))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

out:
	return pfn_to_page(pfn);
}
#endif

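/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */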
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			rss[mm_counter(page)]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		} else if (is_device_private_entry(entry)) {
			page = device_private_entry_to_page(entry);

			get_page(page);
			rss[mm_counter(page)]++;
			page_dup_rmap(page, false);

			if (is_write_device_private_entry(entry) &&
			    is_cow_mapping(vm_flags)) {
				make_device_private_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	} else if (pte_devmap(pte)) {
		page = pte_page(pte);

		if (is_device_public_page(page)) {
			get_page(page);
			page_dup_rmap(page, false);
			rss[mm_counter(page)]++;
		}
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

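/*
 * Copy the page-table contents of @vma from @src_mm into @dst_mm at fork.
 *
 * Mappings that have no anon pages and none of VM_HUGETLB, VM_PFNMAP or
 * VM_MIXEDMAP can skip the copy entirely: a later fault will fill the
 * ptes from the backing file.  For COW mappings the MMU notifiers are
 * told about the range, since the source ptes are write-protected while
 * being copied.
 */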
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;
	unsigned long mmun_end;
	bool is_cow;
	int ret;

	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (pte_present(ptent)) {
			struct page *page;

			page = _vm_normal_page(vma, addr, ptent, true);
			if (unlikely(details) && page) {
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (non_swap_entry(entry) && is_device_private_entry(entry)) {
			struct page *page = device_private_entry_to_page(entry);

			if (unlikely(details && details->check_mapping)) {
				if (details->check_mapping !=
				    page_rmapping(page))
					continue;
			}

			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			put_page(page);
			continue;
		}

		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
		}

		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}

static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

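/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.  Only addresses between @start_addr
 * and @end_addr are considered.
 */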
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

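/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list.
 */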
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, NULL);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}

static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}

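/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */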
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    !(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

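/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.  The caller is expected to guard against concurrent
 * faults itself; usually the first insertion is done with the mmap_sem
 * held for writing, before userspace can touch the range.
 */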
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
				goto out_unlock;
			entry = *pte;
			goto out_mkwrite;
		} else
			goto out_unlock;
	}

	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

out_mkwrite:
	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte);

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

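/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma by raw pfn.  The pfn is
 * inserted with the vma's default page protection; vm_insert_pfn_prot()
 * below additionally lets the caller override the pgprot on a per-page
 * basis, which only makes sense for IO mappings.
 */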
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pfn);

int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	int ret;

	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	if (!pfn_modify_allowed(pfn, pgprot))
		return -EACCES;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);

	return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);

static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, bool mkwrite)
{
	pgprot_t pgprot = vma->vm_page_prot;

	BUG_ON(!vm_mixed_ok(vma, pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	track_pfn_insert(vma, &pgprot, pfn);

	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
		return -EACCES;

	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		page = pfn_to_page(pfn_t_to_pfn(pfn));
		return insert_page(vma, addr, page, pgprot);
	}
	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}

int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vm_insert_mixed);

vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
{
	int err;

	err = __vm_insert_mixed(vma, addr, pfn, true);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;
	int err = 0;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return err;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

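/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */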
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);

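/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory area
 * @len: size of the area
 *
 * This is a simplified io_remap_pfn_range() for common driver mmap
 * handlers: it computes the pfn and length from the physical range and
 * the vma, sanity-checks them, and does the bookkeeping.
 *
 * A minimal, hypothetical driver ->mmap handler using it might look
 * like this (mydev, bar_phys and bar_len are illustrative names only):
 *
 *	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct mydev *dev = file->private_data;
 *
 *		return vm_iomap_memory(vma, dev->bar_phys, dev->bar_len);
 *	}
 */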
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	if (start + len < start)
		return -EINVAL;

	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);
	return err;
}

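/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page-table entry.
 *
 * The callback has the pte_fn_t signature.  As a sketch, a hypothetical
 * callback that counts present ptes (count_present is an illustrative
 * name, not an existing kernel helper) could be:
 *
 *	static int count_present(pte_t *pte, pgtable_t token,
 *				 unsigned long addr, void *data)
 *	{
 *		if (pte_present(*pte))
 *			(*(unsigned long *)data)++;
 *		return 0;
 *	}
 *
 * and would be driven by:
 *
 *	apply_to_page_range(mm, start, size, count_present, &count);
 */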
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	debug_dma_assert_idle(src);

	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}

static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	return GFP_KERNEL;
}

static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);

	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0;
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

static void fault_dirty_shared_page(struct vm_area_struct *vma,
				    struct page *page)
{
	struct address_space *mapping;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);

	mapping = page_rmapping(page);
	unlock_page(page);

	if ((dirtied || page_mkwrite) && mapping) {
		balance_dirty_pages_ratelimited(mapping);
	}

	if (!page_mkwrite)
		file_update_time(vma->vm_file);
}

static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;

	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
}

static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	const unsigned long mmun_start = vmf->address & PAGE_MASK;
	const unsigned long mmun_end = mmun_start + PAGE_SIZE;
	struct mem_cgroup *memcg;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, vmf->address, vma);
	}

	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
		goto oom_free_new;

	__SetPageUptodate(new_page);

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		mem_cgroup_commit_charge(new_page, memcg, false, false);
		lru_cache_add_active_or_unevictable(new_page, vma);

		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			page_remove_rmap(old_page, false);
		}

		new_page = old_page;
		page_copied = 1;
	} else {
		mem_cgroup_cancel_charge(new_page, memcg, false);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);

	mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
	if (old_page) {
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf);
	return 0;
}

static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}

static vm_fault_t wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	fault_dirty_shared_page(vma, vmf->page);
	put_page(vmf->page);

	return VM_FAULT_WRITE;
}

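/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * We enter with the pte lock held; it is released (and the pte possibly
 * remapped) before returning.  An anonymous page that is mapped only
 * once is reused in place instead of being copied.
 */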
2724static vm_fault_t do_wp_page(struct vm_fault *vmf)
2725 __releases(vmf->ptl)
2726{
2727 struct vm_area_struct *vma = vmf->vma;
2728
2729 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2730 if (!vmf->page) {
2731
2732
2733
2734
2735
2736
2737
2738 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2739 (VM_WRITE|VM_SHARED))
2740 return wp_pfn_shared(vmf);
2741
2742 pte_unmap_unlock(vmf->pte, vmf->ptl);
2743 return wp_page_copy(vmf);
2744 }
2745
	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
2750 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2751 int total_map_swapcount;
2752 if (!trylock_page(vmf->page)) {
2753 get_page(vmf->page);
2754 pte_unmap_unlock(vmf->pte, vmf->ptl);
2755 lock_page(vmf->page);
2756 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2757 vmf->address, &vmf->ptl);
2758 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2759 unlock_page(vmf->page);
2760 pte_unmap_unlock(vmf->pte, vmf->ptl);
2761 put_page(vmf->page);
2762 return 0;
2763 }
2764 put_page(vmf->page);
2765 }
2766 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2767 if (total_map_swapcount == 1) {
				/*
				 * The page is all ours.  Move it to
				 * our anon_vma so the rmap code will
				 * not search our parent or siblings.
				 * Protected against the rmap code by
				 * the page lock.
				 */
2775 page_move_anon_rmap(vmf->page, vma);
2776 }
2777 unlock_page(vmf->page);
2778 wp_page_reuse(vmf);
2779 return VM_FAULT_WRITE;
2780 }
2781 unlock_page(vmf->page);
2782 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2783 (VM_WRITE|VM_SHARED))) {
2784 return wp_page_shared(vmf);
2785 }

	/*
	 * Ok, we need to copy. Oh, well..
	 */
2790 get_page(vmf->page);
2791
2792 pte_unmap_unlock(vmf->pte, vmf->ptl);
2793 return wp_page_copy(vmf);
2794}
2795
2796static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2797 unsigned long start_addr, unsigned long end_addr,
2798 struct zap_details *details)
2799{
2800 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2801}
2802
2803static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2804 struct zap_details *details)
2805{
2806 struct vm_area_struct *vma;
2807 pgoff_t vba, vea, zba, zea;
2808
2809 vma_interval_tree_foreach(vma, root,
2810 details->first_index, details->last_index) {
2811
2812 vba = vma->vm_pgoff;
2813 vea = vba + vma_pages(vma) - 1;
2814 zba = details->first_index;
2815 if (zba < vba)
2816 zba = vba;
2817 zea = details->last_index;
2818 if (zea > vea)
2819 zea = vea;
2820
2821 unmap_mapping_range_vma(vma,
2822 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2823 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2824 details);
2825 }
2826}
2827
/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
2840void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
2841 pgoff_t nr, bool even_cows)
2842{
2843 struct zap_details details = { };
2844
2845 details.check_mapping = even_cows ? NULL : mapping;
2846 details.first_index = start;
2847 details.last_index = start + nr - 1;
2848 if (details.last_index < details.first_index)
2849 details.last_index = ULONG_MAX;
2850
2851 i_mmap_lock_write(mapping);
2852 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2853 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2854 i_mmap_unlock_write(mapping);
2855}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
2874void unmap_mapping_range(struct address_space *mapping,
2875 loff_t const holebegin, loff_t const holelen, int even_cows)
2876{
2877 pgoff_t hba = holebegin >> PAGE_SHIFT;
2878 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2879
	/* Check for overflow. */
2881 if (sizeof(holelen) > sizeof(hlen)) {
2882 long long holeend =
2883 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2884 if (holeend & ~(long long)ULONG_MAX)
2885 hlen = ULONG_MAX - hba + 1;
2886 }
2887
2888 unmap_mapping_pages(mapping, hba, hlen, even_cows);
2889}
2890EXPORT_SYMBOL(unmap_mapping_range);
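
/*
 * Illustrative caller (a sketch, not code from this file): a filesystem
 * shrinking i_size typically unmaps everything past the new EOF before
 * dropping the pagecache, relying on holelen == 0 meaning "to end of
 * file":
 *
 *	unmap_mapping_range(mapping, round_up(newsize, PAGE_SIZE), 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *
 * which is the pattern wrapped up by truncate_pagecache().
 */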
2891
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults), and pte mapped but not yet locked.  We return with
 * pte unmapped and unlocked, and with the mmap_sem still held.
 */
2900vm_fault_t do_swap_page(struct vm_fault *vmf)
2901{
2902 struct vm_area_struct *vma = vmf->vma;
2903 struct page *page = NULL, *swapcache;
2904 struct mem_cgroup *memcg;
2905 swp_entry_t entry;
2906 pte_t pte;
2907 int locked;
2908 int exclusive = 0;
2909 vm_fault_t ret = 0;
2910
2911 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2912 goto out;
2913
2914 entry = pte_to_swp_entry(vmf->orig_pte);
2915 if (unlikely(non_swap_entry(entry))) {
2916 if (is_migration_entry(entry)) {
2917 migration_entry_wait(vma->vm_mm, vmf->pmd,
2918 vmf->address);
2919 } else if (is_device_private_entry(entry)) {
			/*
			 * For un-addressable device memory we call the
			 * pgmap fault handler callback via
			 * device_private_entry_fault(); the driver must
			 * migrate the page back to a CPU-accessible page.
			 */
2925 ret = device_private_entry_fault(vma, vmf->address, entry,
2926 vmf->flags, vmf->pmd);
2927 } else if (is_hwpoison_entry(entry)) {
2928 ret = VM_FAULT_HWPOISON;
2929 } else {
2930 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2931 ret = VM_FAULT_SIGBUS;
2932 }
2933 goto out;
2934 }
2935
2936
2937 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2938 page = lookup_swap_cache(entry, vma, vmf->address);
2939 swapcache = page;
2940
2941 if (!page) {
2942 struct swap_info_struct *si = swp_swap_info(entry);
2943
2944 if (si->flags & SWP_SYNCHRONOUS_IO &&
2945 __swap_count(si, entry) == 1) {
			/* skip swapcache */
2947 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2948 vmf->address);
2949 if (page) {
2950 __SetPageLocked(page);
2951 __SetPageSwapBacked(page);
2952 set_page_private(page, entry.val);
2953 lru_cache_add_anon(page);
2954 swap_readpage(page, true);
2955 }
2956 } else {
2957 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2958 vmf);
2959 swapcache = page;
2960 }
2961
2962 if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
2967 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2968 vmf->address, &vmf->ptl);
2969 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2970 ret = VM_FAULT_OOM;
2971 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2972 goto unlock;
2973 }

		/* Had to read the page from swap area: Major fault */
2976 ret = VM_FAULT_MAJOR;
2977 count_vm_event(PGMAJFAULT);
2978 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2979 } else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
2984 ret = VM_FAULT_HWPOISON;
2985 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2986 goto out_release;
2987 }
2988
2989 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2990
2991 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2992 if (!locked) {
2993 ret |= VM_FAULT_RETRY;
2994 goto out_release;
2995 }

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
3003 if (unlikely((!PageSwapCache(page) ||
3004 page_private(page) != entry.val)) && swapcache)
3005 goto out_page;
3006
3007 page = ksm_might_need_to_copy(page, vma, vmf->address);
3008 if (unlikely(!page)) {
3009 ret = VM_FAULT_OOM;
3010 page = swapcache;
3011 goto out_page;
3012 }
3013
3014 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
3015 &memcg, false)) {
3016 ret = VM_FAULT_OOM;
3017 goto out_page;
3018 }
3019

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
3023 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3024 &vmf->ptl);
3025 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3026 goto out_nomap;
3027
3028 if (unlikely(!PageUptodate(page))) {
3029 ret = VM_FAULT_SIGBUS;
3030 goto out_nomap;
3031 }
3032
	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is locked, and the rmap, memcg and LRU updates
	 * below must all happen under the pte lock taken above.
	 */
3043 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3044 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3045 pte = mk_pte(page, vma->vm_page_prot);
3046 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3047 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3048 vmf->flags &= ~FAULT_FLAG_WRITE;
3049 ret |= VM_FAULT_WRITE;
3050 exclusive = RMAP_EXCLUSIVE;
3051 }
3052 flush_icache_page(vma, page);
3053 if (pte_swp_soft_dirty(vmf->orig_pte))
3054 pte = pte_mksoft_dirty(pte);
3055 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3056 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3057 vmf->orig_pte = pte;

	/* ksm created a completely new copy */
3060 if (unlikely(page != swapcache && swapcache)) {
3061 page_add_new_anon_rmap(page, vma, vmf->address, false);
3062 mem_cgroup_commit_charge(page, memcg, false, false);
3063 lru_cache_add_active_or_unevictable(page, vma);
3064 } else {
3065 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3066 mem_cgroup_commit_charge(page, memcg, true, false);
3067 activate_page(page);
3068 }
3069
3070 swap_free(entry);
3071 if (mem_cgroup_swap_full(page) ||
3072 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3073 try_to_free_swap(page);
3074 unlock_page(page);
3075 if (page != swapcache && swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same).  For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
3084 unlock_page(swapcache);
3085 put_page(swapcache);
3086 }
3087
3088 if (vmf->flags & FAULT_FLAG_WRITE) {
3089 ret |= do_wp_page(vmf);
3090 if (ret & VM_FAULT_ERROR)
3091 ret &= VM_FAULT_ERROR;
3092 goto out;
3093 }

	/* No need to invalidate - it was non-present before */
3096 update_mmu_cache(vma, vmf->address, vmf->pte);
3097unlock:
3098 pte_unmap_unlock(vmf->pte, vmf->ptl);
3099out:
3100 return ret;
3101out_nomap:
3102 mem_cgroup_cancel_charge(page, memcg, false);
3103 pte_unmap_unlock(vmf->pte, vmf->ptl);
3104out_page:
3105 unlock_page(page);
3106out_release:
3107 put_page(page);
3108 if (page != swapcache && swapcache) {
3109 unlock_page(swapcache);
3110 put_page(swapcache);
3111 }
3112 return ret;
3113}
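
/*
 * Note on the swapin fast path above: when the backing device declares
 * SWP_SYNCHRONOUS_IO (zram, for instance) and the swap count is 1,
 * do_swap_page() reads the page synchronously into a freshly allocated
 * page and bypasses the swap cache entirely, sparing the radix tree
 * insertion and lookup for a page that nobody else can map.
 */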
3114
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
3120static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3121{
3122 struct vm_area_struct *vma = vmf->vma;
3123 struct mem_cgroup *memcg;
3124 struct page *page;
3125 vm_fault_t ret = 0;
3126 pte_t entry;
3127
	/* File mapping without ->vm_ops ? */
3129 if (vma->vm_flags & VM_SHARED)
3130 return VM_FAULT_SIGBUS;
3131
	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under down_write(&mm->mmap_sem) or
	 * when parallel threads are excluded by other means.
	 *
	 * Here we only have down_read(&mm->mmap_sem).
	 */
3142 if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
3143 return VM_FAULT_OOM;
3144
	/* See the comment in pte_alloc_one_map() */
3146 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3147 return 0;
3148
	/* Use the zero-page for reads */
3150 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3151 !mm_forbids_zeropage(vma->vm_mm)) {
3152 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3153 vma->vm_page_prot));
3154 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3155 vmf->address, &vmf->ptl);
3156 if (!pte_none(*vmf->pte))
3157 goto unlock;
3158 ret = check_stable_address_space(vma->vm_mm);
3159 if (ret)
3160 goto unlock;
3161
3162 if (userfaultfd_missing(vma)) {
3163 pte_unmap_unlock(vmf->pte, vmf->ptl);
3164 return handle_userfault(vmf, VM_UFFD_MISSING);
3165 }
3166 goto setpte;
3167 }

	/* Allocate our own private page. */
3170 if (unlikely(anon_vma_prepare(vma)))
3171 goto oom;
3172 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3173 if (!page)
3174 goto oom;
3175
3176 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
3177 false))
3178 goto oom_free_page;
3179

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
3185 __SetPageUptodate(page);
3186
3187 entry = mk_pte(page, vma->vm_page_prot);
3188 if (vma->vm_flags & VM_WRITE)
3189 entry = pte_mkwrite(pte_mkdirty(entry));
3190
3191 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3192 &vmf->ptl);
3193 if (!pte_none(*vmf->pte))
3194 goto release;
3195
3196 ret = check_stable_address_space(vma->vm_mm);
3197 if (ret)
3198 goto release;
3199
	/* Deliver the page fault to userland, check inside PT lock */
3201 if (userfaultfd_missing(vma)) {
3202 pte_unmap_unlock(vmf->pte, vmf->ptl);
3203 mem_cgroup_cancel_charge(page, memcg, false);
3204 put_page(page);
3205 return handle_userfault(vmf, VM_UFFD_MISSING);
3206 }
3207
3208 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3209 page_add_new_anon_rmap(page, vma, vmf->address, false);
3210 mem_cgroup_commit_charge(page, memcg, false, false);
3211 lru_cache_add_active_or_unevictable(page, vma);
3212setpte:
3213 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate - it was non-present before */
3216 update_mmu_cache(vma, vmf->address, vmf->pte);
3217unlock:
3218 pte_unmap_unlock(vmf->pte, vmf->ptl);
3219 return ret;
3220release:
3221 mem_cgroup_cancel_charge(page, memcg, false);
3222 put_page(page);
3223 goto unlock;
3224oom_free_page:
3225 put_page(page);
3226oom:
3227 return VM_FAULT_OOM;
3228}
3229
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
3235static vm_fault_t __do_fault(struct vm_fault *vmf)
3236{
3237 struct vm_area_struct *vma = vmf->vma;
3238 vm_fault_t ret;
3239
3240 ret = vma->vm_ops->fault(vmf);
3241 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3242 VM_FAULT_DONE_COW)))
3243 return ret;
3244
3245 if (unlikely(PageHWPoison(vmf->page))) {
3246 if (ret & VM_FAULT_LOCKED)
3247 unlock_page(vmf->page);
3248 put_page(vmf->page);
3249 vmf->page = NULL;
3250 return VM_FAULT_HWPOISON;
3251 }
3252
3253 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3254 lock_page(vmf->page);
3255 else
3256 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3257
3258 return ret;
3259}
3260
/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad().  This will end up
 * correctly returning 1, but not before spamming dmesg with the
 * pmd_clear_bad() output.
 */
3267static int pmd_devmap_trans_unstable(pmd_t *pmd)
3268{
3269 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3270}
3271
3272static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3273{
3274 struct vm_area_struct *vma = vmf->vma;
3275
3276 if (!pmd_none(*vmf->pmd))
3277 goto map_pte;
3278 if (vmf->prealloc_pte) {
3279 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3280 if (unlikely(!pmd_none(*vmf->pmd))) {
3281 spin_unlock(vmf->ptl);
3282 goto map_pte;
3283 }
3284
3285 mm_inc_nr_ptes(vma->vm_mm);
3286 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3287 spin_unlock(vmf->ptl);
3288 vmf->prealloc_pte = NULL;
3289 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
3290 return VM_FAULT_OOM;
3291 }
3292map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
	 * running immediately after a huge pmd fault in a different thread
	 * of this mm, in turn leading to a misleading pmd_trans_huge()
	 * retval.  All we have to ensure is that it is a regular pmd that
	 * we can walk with pte_offset_map() and we can do that through an
	 * atomic read in C, which is what pmd_trans_unstable() provides.
	 */
3304 if (pmd_devmap_trans_unstable(vmf->pmd))
3305 return VM_FAULT_NOPAGE;
3306

	/*
	 * At this point we know that our vmf->pmd points to a page of ptes
	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
	 * under us, so it is safe to map the pte and take its lock.
	 */
3316 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3317 &vmf->ptl);
3318 return 0;
3319}
3320
3321#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3322
3323#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3324static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3325 unsigned long haddr)
3326{
3327 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3328 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3329 return false;
3330 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3331 return false;
3332 return true;
3333}
3334
3335static void deposit_prealloc_pte(struct vm_fault *vmf)
3336{
3337 struct vm_area_struct *vma = vmf->vma;
3338
3339 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
3344 mm_inc_nr_ptes(vma->vm_mm);
3345 vmf->prealloc_pte = NULL;
3346}
3347
3348static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3349{
3350 struct vm_area_struct *vma = vmf->vma;
3351 bool write = vmf->flags & FAULT_FLAG_WRITE;
3352 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3353 pmd_t entry;
3354 int i;
3355 vm_fault_t ret;
3356
3357 if (!transhuge_vma_suitable(vma, haddr))
3358 return VM_FAULT_FALLBACK;
3359
3360 ret = VM_FAULT_FALLBACK;
3361 page = compound_head(page);

	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry.  Use the preallocated table for that.
	 */
3367 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3368 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
3369 if (!vmf->prealloc_pte)
3370 return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
3372 }
3373
3374 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3375 if (unlikely(!pmd_none(*vmf->pmd)))
3376 goto out;
3377
3378 for (i = 0; i < HPAGE_PMD_NR; i++)
3379 flush_icache_page(vma, page + i);
3380
3381 entry = mk_huge_pmd(page, vma->vm_page_prot);
3382 if (write)
3383 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3384
3385 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3386 page_add_file_rmap(page, true);
	/*
	 * deposit and withdraw with pmd lock held
	 */
3390 if (arch_needs_pgtable_deposit())
3391 deposit_prealloc_pte(vmf);
3392
3393 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3394
3395 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3396
	/* fault is handled */
3398 ret = 0;
3399 count_vm_event(THP_FILE_MAPPED);
3400out:
3401 spin_unlock(vmf->ptl);
3402 return ret;
3403}
3404#else
3405static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3406{
3407 BUILD_BUG();
3408 return 0;
3409}
3410#endif
3411

/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping.  If needed, the function allocates a page table or uses one
 * that was pre-allocated.
 *
 * @vmf: fault environment
 * @memcg: memcg to charge page (only for private mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are the page fault handler itself and implementations of
 * vm_ops->map_pages.
 */
3426vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3427 struct page *page)
3428{
3429 struct vm_area_struct *vma = vmf->vma;
3430 bool write = vmf->flags & FAULT_FLAG_WRITE;
3431 pte_t entry;
3432 vm_fault_t ret;
3433
3434 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3435 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
		/* THP on COW? */
3437 VM_BUG_ON_PAGE(memcg, page);
3438
3439 ret = do_set_pmd(vmf, page);
3440 if (ret != VM_FAULT_FALLBACK)
3441 return ret;
3442 }
3443
3444 if (!vmf->pte) {
3445 ret = pte_alloc_one_map(vmf);
3446 if (ret)
3447 return ret;
3448 }
3449
	/* Re-check under ptl */
3451 if (unlikely(!pte_none(*vmf->pte)))
3452 return VM_FAULT_NOPAGE;
3453
3454 flush_icache_page(vma, page);
3455 entry = mk_pte(page, vma->vm_page_prot);
3456 if (write)
3457 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	/* copy-on-write page */
3459 if (write && !(vma->vm_flags & VM_SHARED)) {
3460 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3461 page_add_new_anon_rmap(page, vma, vmf->address, false);
3462 mem_cgroup_commit_charge(page, memcg, false, false);
3463 lru_cache_add_active_or_unevictable(page, vma);
3464 } else {
3465 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3466 page_add_file_rmap(page, false);
3467 }
3468 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3469
	/* no need to invalidate: a not-present page won't be cached */
3471 update_mmu_cache(vma, vmf->address, vmf->pte);
3472
3473 return 0;
3474}
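
/*
 * For illustration (a sketch, not code from this file): an
 * implementation of vm_ops->map_pages() walks the pages it already has
 * ready (locked, uptodate) in [start_pgoff, end_pgoff], advances
 * vmf->address and vmf->pte to match each page's position, and calls
 * alloc_set_pte(vmf, NULL, page), stopping at the first non-zero
 * return.  filemap_map_pages() follows this pattern, with a NULL memcg
 * because the pages are file-backed.
 */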
3475

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition. The function returns 0 on success, VM_FAULT_ code in case of
 * error.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 */
3491vm_fault_t finish_fault(struct vm_fault *vmf)
3492{
3493 struct page *page;
3494 vm_fault_t ret = 0;
3495
	/* Did we COW the page? */
3497 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3498 !(vmf->vma->vm_flags & VM_SHARED))
3499 page = vmf->cow_page;
3500 else
3501 page = vmf->page;
3502

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
3507 if (!(vmf->vma->vm_flags & VM_SHARED))
3508 ret = check_stable_address_space(vmf->vma->vm_mm);
3509 if (!ret)
3510 ret = alloc_set_pte(vmf, vmf->memcg, page);
3511 if (vmf->pte)
3512 pte_unmap_unlock(vmf->pte, vmf->ptl);
3513 return ret;
3514}
3515
3516static unsigned long fault_around_bytes __read_mostly =
3517 rounddown_pow_of_two(65536);
3518
3519#ifdef CONFIG_DEBUG_FS
3520static int fault_around_bytes_get(void *data, u64 *val)
3521{
3522 *val = fault_around_bytes;
3523 return 0;
3524}
3525

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
3530static int fault_around_bytes_set(void *data, u64 val)
3531{
3532 if (val / PAGE_SIZE > PTRS_PER_PTE)
3533 return -EINVAL;
3534 if (val > PAGE_SIZE)
3535 fault_around_bytes = rounddown_pow_of_two(val);
3536 else
3537 fault_around_bytes = PAGE_SIZE;
3538 return 0;
3539}
3540DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3541 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3542
3543static int __init fault_around_debugfs(void)
3544{
3545 void *ret;
3546
3547 ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3548 &fault_around_bytes_fops);
3549 if (!ret)
3550 pr_warn("Failed to create fault_around_bytes in debugfs");
3551 return 0;
3552}
3553late_initcall(fault_around_debugfs);
3554#endif
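
/*
 * With CONFIG_DEBUG_FS the window can be tuned at run time, e.g.
 *
 *	echo 4096 > /sys/kernel/debug/fault_around_bytes
 *
 * drops back to one page per fault, while the default of 65536 maps up
 * to sixteen 4K pages around each read fault.
 */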
3555
/*
 * do_fault_around() tries to map few pages around the fault address. The
 * hope is that the pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * In the split ptlock case the page table lock only protects those entries
 * which belong to the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call
 * map_pages() only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or
 * equal to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
3580static vm_fault_t do_fault_around(struct vm_fault *vmf)
3581{
3582 unsigned long address = vmf->address, nr_pages, mask;
3583 pgoff_t start_pgoff = vmf->pgoff;
3584 pgoff_t end_pgoff;
3585 int off;
3586 vm_fault_t ret = 0;
3587
3588 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3589 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3590
3591 vmf->address = max(address & mask, vmf->vma->vm_start);
3592 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3593 start_pgoff -= off;
3594

	/*
	 *  end_pgoff is either the end of the page table, the end of
	 *  the vma or nr_pages from start_pgoff, depending what is nearest.
	 */
3599 end_pgoff = start_pgoff -
3600 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3601 PTRS_PER_PTE - 1;
3602 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3603 start_pgoff + nr_pages - 1);
3604
3605 if (pmd_none(*vmf->pmd)) {
3606 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3607 vmf->address);
3608 if (!vmf->prealloc_pte)
3609 goto out;
		smp_wmb(); /* See comment in __pte_alloc() */
3611 }
3612
3613 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3614
	/* Huge page is mapped? Page fault is solved */
3616 if (pmd_trans_huge(*vmf->pmd)) {
3617 ret = VM_FAULT_NOPAGE;
3618 goto out;
3619 }
3620
	/* ->map_pages() haven't done anything useful. Cold page cache? */
3622 if (!vmf->pte)
3623 goto out;
3624
	/* check if the page fault is solved */
3626 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3627 if (!pte_none(*vmf->pte))
3628 ret = VM_FAULT_NOPAGE;
3629 pte_unmap_unlock(vmf->pte, vmf->ptl);
3630out:
3631 vmf->address = address;
3632 vmf->pte = NULL;
3633 return ret;
3634}
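
/*
 * Worked example (assuming 4K pages, PTRS_PER_PTE == 512 and the
 * default 64K window): for a fault at 0x7f1234567890, mask is ~0xffffUL,
 * so the mapped window starts at max(0x7f1234560000, vma->vm_start).
 * off is (0x7890 >> 12) & 511 = 7, so start_pgoff backs up seven pages,
 * and end_pgoff is clamped to the end of the page table, the end of the
 * VMA, or start_pgoff + 15, whichever is nearest.
 */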
3635
3636static vm_fault_t do_read_fault(struct vm_fault *vmf)
3637{
3638 struct vm_area_struct *vma = vmf->vma;
3639 vm_fault_t ret = 0;
3640
	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
3646 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3647 ret = do_fault_around(vmf);
3648 if (ret)
3649 return ret;
3650 }
3651
3652 ret = __do_fault(vmf);
3653 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3654 return ret;
3655
3656 ret |= finish_fault(vmf);
3657 unlock_page(vmf->page);
3658 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3659 put_page(vmf->page);
3660 return ret;
3661}
3662
3663static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3664{
3665 struct vm_area_struct *vma = vmf->vma;
3666 vm_fault_t ret;
3667
3668 if (unlikely(anon_vma_prepare(vma)))
3669 return VM_FAULT_OOM;
3670
3671 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3672 if (!vmf->cow_page)
3673 return VM_FAULT_OOM;
3674
3675 if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3676 &vmf->memcg, false)) {
3677 put_page(vmf->cow_page);
3678 return VM_FAULT_OOM;
3679 }
3680
3681 ret = __do_fault(vmf);
3682 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3683 goto uncharge_out;
3684 if (ret & VM_FAULT_DONE_COW)
3685 return ret;
3686
3687 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3688 __SetPageUptodate(vmf->cow_page);
3689
3690 ret |= finish_fault(vmf);
3691 unlock_page(vmf->page);
3692 put_page(vmf->page);
3693 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3694 goto uncharge_out;
3695 return ret;
3696uncharge_out:
3697 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3698 put_page(vmf->cow_page);
3699 return ret;
3700}
3701
3702static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3703{
3704 struct vm_area_struct *vma = vmf->vma;
3705 vm_fault_t ret, tmp;
3706
3707 ret = __do_fault(vmf);
3708 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3709 return ret;
3710
	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
3715 if (vma->vm_ops->page_mkwrite) {
3716 unlock_page(vmf->page);
3717 tmp = do_page_mkwrite(vmf);
3718 if (unlikely(!tmp ||
3719 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3720 put_page(vmf->page);
3721 return tmp;
3722 }
3723 }
3724
3725 ret |= finish_fault(vmf);
3726 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3727 VM_FAULT_RETRY))) {
3728 unlock_page(vmf->page);
3729 put_page(vmf->page);
3730 return ret;
3731 }
3732
3733 fault_dirty_shared_page(vma, vmf->page);
3734 return ret;
3735}
3736

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_sem is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
3743static vm_fault_t do_fault(struct vm_fault *vmf)
3744{
3745 struct vm_area_struct *vma = vmf->vma;
3746 vm_fault_t ret;
3747
	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3749 if (!vma->vm_ops->fault)
3750 ret = VM_FAULT_SIGBUS;
3751 else if (!(vmf->flags & FAULT_FLAG_WRITE))
3752 ret = do_read_fault(vmf);
3753 else if (!(vma->vm_flags & VM_SHARED))
3754 ret = do_cow_fault(vmf);
3755 else
3756 ret = do_shared_fault(vmf);
3757
	/* preallocated pagetable is unused: free it */
3759 if (vmf->prealloc_pte) {
3760 pte_free(vma->vm_mm, vmf->prealloc_pte);
3761 vmf->prealloc_pte = NULL;
3762 }
3763 return ret;
3764}
3765
3766static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3767 unsigned long addr, int page_nid,
3768 int *flags)
3769{
3770 get_page(page);
3771
3772 count_vm_numa_event(NUMA_HINT_FAULTS);
3773 if (page_nid == numa_node_id()) {
3774 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3775 *flags |= TNF_FAULT_LOCAL;
3776 }
3777
3778 return mpol_misplaced(page, vma, addr);
3779}
3780
3781static vm_fault_t do_numa_page(struct vm_fault *vmf)
3782{
3783 struct vm_area_struct *vma = vmf->vma;
3784 struct page *page = NULL;
3785 int page_nid = -1;
3786 int last_cpupid;
3787 int target_nid;
3788 bool migrated = false;
3789 pte_t pte;
3790 bool was_writable = pte_savedwrite(vmf->orig_pte);
3791 int flags = 0;
3792
	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 */
3798 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3799 spin_lock(vmf->ptl);
3800 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3801 pte_unmap_unlock(vmf->pte, vmf->ptl);
3802 goto out;
3803 }
3804
	/*
	 * Make it present again.  Depending on how the arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
3809 pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
3810 pte = pte_modify(pte, vma->vm_page_prot);
3811 pte = pte_mkyoung(pte);
3812 if (was_writable)
3813 pte = pte_mkwrite(pte);
3814 ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
3815 update_mmu_cache(vma, vmf->address, vmf->pte);
3816
3817 page = vm_normal_page(vma, vmf->address, pte);
3818 if (!page) {
3819 pte_unmap_unlock(vmf->pte, vmf->ptl);
3820 return 0;
3821 }
3822
	/* TODO: handle PTE-mapped THP */
3824 if (PageCompound(page)) {
3825 pte_unmap_unlock(vmf->pte, vmf->ptl);
3826 return 0;
3827 }
3828

	/*
	 * Avoid grouping on RO pages in general.  RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state.  This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
3837 if (!pte_write(pte))
3838 flags |= TNF_NO_GROUP;
3839

	/*
	 * Flag if the page is shared between multiple address spaces.  This
	 * is later used when determining whether to group tasks together
	 */
3844 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3845 flags |= TNF_SHARED;
3846
3847 last_cpupid = page_cpupid_last(page);
3848 page_nid = page_to_nid(page);
3849 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3850 &flags);
3851 pte_unmap_unlock(vmf->pte, vmf->ptl);
3852 if (target_nid == -1) {
3853 put_page(page);
3854 goto out;
3855 }
3856
	/* Migrate to the requested node */
3858 migrated = migrate_misplaced_page(page, vma, target_nid);
3859 if (migrated) {
3860 page_nid = target_nid;
3861 flags |= TNF_MIGRATED;
3862 } else
3863 flags |= TNF_MIGRATE_FAIL;
3864
3865out:
3866 if (page_nid != -1)
3867 task_numa_fault(last_cpupid, page_nid, 1, flags);
3868 return 0;
3869}
3870
3871static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
3872{
3873 if (vma_is_anonymous(vmf->vma))
3874 return do_huge_pmd_anonymous_page(vmf);
3875 if (vmf->vma->vm_ops->huge_fault)
3876 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3877 return VM_FAULT_FALLBACK;
3878}
3879
/* `inline' is required to avoid gcc 4.1.2 build error */
3881static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3882{
3883 if (vma_is_anonymous(vmf->vma))
3884 return do_huge_pmd_wp_page(vmf, orig_pmd);
3885 if (vmf->vma->vm_ops->huge_fault)
3886 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3887
	/* COW handled on pte level: split pmd. */
3889 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3890 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3891
3892 return VM_FAULT_FALLBACK;
3893}
3894
3895static inline bool vma_is_accessible(struct vm_area_struct *vma)
3896{
3897 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3898}
3899
3900static vm_fault_t create_huge_pud(struct vm_fault *vmf)
3901{
3902#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
3904 if (vma_is_anonymous(vmf->vma))
3905 return VM_FAULT_FALLBACK;
3906 if (vmf->vma->vm_ops->huge_fault)
3907 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3908#endif
3909 return VM_FAULT_FALLBACK;
3910}
3911
3912static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3913{
3914#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
3916 if (vma_is_anonymous(vmf->vma))
3917 return VM_FAULT_FALLBACK;
3918 if (vmf->vma->vm_ops->huge_fault)
3919 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3920#endif
3921 return VM_FAULT_FALLBACK;
3922}
3923
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
3939static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
3940{
3941 pte_t entry;
3942
3943 if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
3950 vmf->pte = NULL;
3951 } else {
		/* See comment in pte_alloc_one_map() */
3953 if (pmd_devmap_trans_unstable(vmf->pmd))
3954 return 0;
3955
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_sem read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
3961 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3962 vmf->orig_pte = *vmf->pte;
3963
		/*
		 * some architectures can have larger ptes than wordsize,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held.  So here a barrier will do.
		 */
3972 barrier();
3973 if (pte_none(vmf->orig_pte)) {
3974 pte_unmap(vmf->pte);
3975 vmf->pte = NULL;
3976 }
3977 }
3978
3979 if (!vmf->pte) {
3980 if (vma_is_anonymous(vmf->vma))
3981 return do_anonymous_page(vmf);
3982 else
3983 return do_fault(vmf);
3984 }
3985
3986 if (!pte_present(vmf->orig_pte))
3987 return do_swap_page(vmf);
3988
3989 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3990 return do_numa_page(vmf);
3991
3992 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3993 spin_lock(vmf->ptl);
3994 entry = vmf->orig_pte;
3995 if (unlikely(!pte_same(*vmf->pte, entry)))
3996 goto unlock;
3997 if (vmf->flags & FAULT_FLAG_WRITE) {
3998 if (!pte_write(entry))
3999 return do_wp_page(vmf);
4000 entry = pte_mkdirty(entry);
4001 }
4002 entry = pte_mkyoung(entry);
4003 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4004 vmf->flags & FAULT_FLAG_WRITE)) {
4005 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4006 } else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
4013 if (vmf->flags & FAULT_FLAG_WRITE)
4014 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4015 }
4016unlock:
4017 pte_unmap_unlock(vmf->pte, vmf->ptl);
4018 return 0;
4019}
4020

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
4027static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4028 unsigned long address, unsigned int flags)
4029{
4030 struct vm_fault vmf = {
4031 .vma = vma,
4032 .address = address & PAGE_MASK,
4033 .flags = flags,
4034 .pgoff = linear_page_index(vma, address),
4035 .gfp_mask = __get_fault_gfp_mask(vma),
4036 };
4037 unsigned int dirty = flags & FAULT_FLAG_WRITE;
4038 struct mm_struct *mm = vma->vm_mm;
4039 pgd_t *pgd;
4040 p4d_t *p4d;
4041 vm_fault_t ret;
4042
4043 pgd = pgd_offset(mm, address);
4044 p4d = p4d_alloc(mm, pgd, address);
4045 if (!p4d)
4046 return VM_FAULT_OOM;
4047
4048 vmf.pud = pud_alloc(mm, p4d, address);
4049 if (!vmf.pud)
4050 return VM_FAULT_OOM;
4051 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
4052 ret = create_huge_pud(&vmf);
4053 if (!(ret & VM_FAULT_FALLBACK))
4054 return ret;
4055 } else {
4056 pud_t orig_pud = *vmf.pud;
4057
4058 barrier();
4059 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

4063 if (dirty && !pud_write(orig_pud)) {
4064 ret = wp_huge_pud(&vmf, orig_pud);
4065 if (!(ret & VM_FAULT_FALLBACK))
4066 return ret;
4067 } else {
4068 huge_pud_set_accessed(&vmf, orig_pud);
4069 return 0;
4070 }
4071 }
4072 }
4073
4074 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4075 if (!vmf.pmd)
4076 return VM_FAULT_OOM;
4077 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
4078 ret = create_huge_pmd(&vmf);
4079 if (!(ret & VM_FAULT_FALLBACK))
4080 return ret;
4081 } else {
4082 pmd_t orig_pmd = *vmf.pmd;
4083
4084 barrier();
4085 if (unlikely(is_swap_pmd(orig_pmd))) {
4086 VM_BUG_ON(thp_migration_supported() &&
4087 !is_pmd_migration_entry(orig_pmd));
4088 if (is_pmd_migration_entry(orig_pmd))
4089 pmd_migration_entry_wait(mm, vmf.pmd);
4090 return 0;
4091 }
4092 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4093 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4094 return do_huge_pmd_numa_page(&vmf, orig_pmd);
4095
4096 if (dirty && !pmd_write(orig_pmd)) {
4097 ret = wp_huge_pmd(&vmf, orig_pmd);
4098 if (!(ret & VM_FAULT_FALLBACK))
4099 return ret;
4100 } else {
4101 huge_pmd_set_accessed(&vmf, orig_pmd);
4102 return 0;
4103 }
4104 }
4105 }
4106
4107 return handle_pte_fault(&vmf);
4108}
4109

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
4116vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4117 unsigned int flags)
4118{
4119 vm_fault_t ret;
4120
4121 __set_current_state(TASK_RUNNING);
4122
4123 count_vm_event(PGFAULT);
4124 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4125
	/* do counter updates before entering really critical section. */
4127 check_sync_rss_stat(current);
4128
4129 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4130 flags & FAULT_FLAG_INSTRUCTION,
4131 flags & FAULT_FLAG_REMOTE))
4132 return VM_FAULT_SIGSEGV;
4133

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * mode.  Kernel faults are handled more gracefully.
	 */
4138 if (flags & FAULT_FLAG_USER)
4139 mem_cgroup_enter_user_fault();
4140
4141 if (unlikely(is_vm_hugetlb_page(vma)))
4142 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4143 else
4144 ret = __handle_mm_fault(vma, address, flags);
4145
4146 if (flags & FAULT_FLAG_USER) {
4147 mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
4154 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4155 mem_cgroup_oom_synchronize(false);
4156 }
4157
4158 return ret;
4159}
4160EXPORT_SYMBOL_GPL(handle_mm_fault);
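
/*
 * Illustrative call site (a simplified sketch of a typical arch fault
 * handler, not code from this file): after taking mmap_sem for read,
 * finding the vma and checking access rights, the arch code does
 * roughly:
 *
 *	vm_fault_t fault = handle_mm_fault(vma, address, flags);
 *
 *	if (fault & VM_FAULT_RETRY) {
 *		flags |= FAULT_FLAG_TRIED;	// mmap_sem was dropped
 *		goto retry;			// take it and find the vma again
 *	}
 *
 * and maps any VM_FAULT_ERROR bits to SIGSEGV/SIGBUS or an OOM kill.
 */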
4161
4162#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path, if any.
 */
4167int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4168{
4169 p4d_t *new = p4d_alloc_one(mm, address);
4170 if (!new)
4171 return -ENOMEM;
4172
	smp_wmb(); /* See comment in __pte_alloc */
4174
4175 spin_lock(&mm->page_table_lock);
4176 if (pgd_present(*pgd))
4177 p4d_free(mm, new);
4178 else
4179 pgd_populate(mm, pgd, new);
4180 spin_unlock(&mm->page_table_lock);
4181 return 0;
4182}
4183#endif
4184
4185#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path, if any.
 */
4190int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4191{
4192 pud_t *new = pud_alloc_one(mm, address);
4193 if (!new)
4194 return -ENOMEM;
4195
	smp_wmb(); /* See comment in __pte_alloc */
4197
4198 spin_lock(&mm->page_table_lock);
4199#ifndef __ARCH_HAS_5LEVEL_HACK
4200 if (!p4d_present(*p4d)) {
4201 mm_inc_nr_puds(mm);
4202 p4d_populate(mm, p4d, new);
4203 } else
4204 pud_free(mm, new);
4205#else
4206 if (!pgd_present(*p4d)) {
4207 mm_inc_nr_puds(mm);
4208 pgd_populate(mm, p4d, new);
4209 } else
4210 pud_free(mm, new);
4211#endif
4212 spin_unlock(&mm->page_table_lock);
4213 return 0;
4214}
4215#endif
4216
4217#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path, if any.
 */
4222int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4223{
4224 spinlock_t *ptl;
4225 pmd_t *new = pmd_alloc_one(mm, address);
4226 if (!new)
4227 return -ENOMEM;
4228
	smp_wmb(); /* See comment in __pte_alloc */
4230
4231 ptl = pud_lock(mm, pud);
4232#ifndef __ARCH_HAS_4LEVEL_HACK
4233 if (!pud_present(*pud)) {
4234 mm_inc_nr_pmds(mm);
4235 pud_populate(mm, pud, new);
4236 } else
4237 pmd_free(mm, new);
4238#else
4239 if (!pgd_present(*pud)) {
4240 mm_inc_nr_pmds(mm);
4241 pgd_populate(mm, pud, new);
4242 } else
4243 pmd_free(mm, new);
4244#endif
4245 spin_unlock(ptl);
4246 return 0;
4247}
4248#endif
4249
4250static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4251 unsigned long *start, unsigned long *end,
4252 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4253{
4254 pgd_t *pgd;
4255 p4d_t *p4d;
4256 pud_t *pud;
4257 pmd_t *pmd;
4258 pte_t *ptep;
4259
4260 pgd = pgd_offset(mm, address);
4261 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4262 goto out;
4263
4264 p4d = p4d_offset(pgd, address);
4265 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4266 goto out;
4267
4268 pud = pud_offset(p4d, address);
4269 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4270 goto out;
4271
4272 pmd = pmd_offset(pud, address);
4273 VM_BUG_ON(pmd_trans_huge(*pmd));
4274
4275 if (pmd_huge(*pmd)) {
4276 if (!pmdpp)
4277 goto out;
4278
4279 if (start && end) {
4280 *start = address & PMD_MASK;
4281 *end = *start + PMD_SIZE;
4282 mmu_notifier_invalidate_range_start(mm, *start, *end);
4283 }
4284 *ptlp = pmd_lock(mm, pmd);
4285 if (pmd_huge(*pmd)) {
4286 *pmdpp = pmd;
4287 return 0;
4288 }
4289 spin_unlock(*ptlp);
4290 if (start && end)
4291 mmu_notifier_invalidate_range_end(mm, *start, *end);
4292 }
4293
4294 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4295 goto out;
4296
4297 if (start && end) {
4298 *start = address & PAGE_MASK;
4299 *end = *start + PAGE_SIZE;
4300 mmu_notifier_invalidate_range_start(mm, *start, *end);
4301 }
4302 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4303 if (!pte_present(*ptep))
4304 goto unlock;
4305 *ptepp = ptep;
4306 return 0;
4307unlock:
4308 pte_unmap_unlock(ptep, *ptlp);
4309 if (start && end)
4310 mmu_notifier_invalidate_range_end(mm, *start, *end);
4311out:
4312 return -EINVAL;
4313}
4314
4315static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4316 pte_t **ptepp, spinlock_t **ptlp)
4317{
4318 int res;
4319
	/* (void) is needed to make gcc happy */
4321 (void) __cond_lock(*ptlp,
4322 !(res = __follow_pte_pmd(mm, address, NULL, NULL,
4323 ptepp, NULL, ptlp)));
4324 return res;
4325}
4326
4327int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4328 unsigned long *start, unsigned long *end,
4329 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4330{
4331 int res;
4332
	/* (void) is needed to make gcc happy */
4334 (void) __cond_lock(*ptlp,
4335 !(res = __follow_pte_pmd(mm, address, start, end,
4336 ptepp, pmdpp, ptlp)));
4337 return res;
4338}
4339EXPORT_SYMBOL(follow_pte_pmd);
4340
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
4351int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4352 unsigned long *pfn)
4353{
4354 int ret = -EINVAL;
4355 spinlock_t *ptl;
4356 pte_t *ptep;
4357
4358 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4359 return ret;
4360
4361 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4362 if (ret)
4363 return ret;
4364 *pfn = pte_pfn(*ptep);
4365 pte_unmap_unlock(ptep, ptl);
4366 return 0;
4367}
4368EXPORT_SYMBOL(follow_pfn);
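
/*
 * Illustrative use (a sketch under stated assumptions, not code from
 * this file): a driver that wants the physical frame behind a
 * userspace VM_PFNMAP mapping, with mmap_sem held for read, can do:
 *
 *	unsigned long pfn;
 *
 *	if (!follow_pfn(vma, uaddr, &pfn))
 *		pa = (phys_addr_t)pfn << PAGE_SHIFT;
 *
 * Nothing keeps the pte stable after the lookup returns, so the result
 * is only a snapshot; uaddr and pa here are hypothetical names.
 */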
4369
4370#ifdef CONFIG_HAVE_IOREMAP_PROT
4371int follow_phys(struct vm_area_struct *vma,
4372 unsigned long address, unsigned int flags,
4373 unsigned long *prot, resource_size_t *phys)
4374{
4375 int ret = -EINVAL;
4376 pte_t *ptep, pte;
4377 spinlock_t *ptl;
4378
4379 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4380 goto out;
4381
4382 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4383 goto out;
4384 pte = *ptep;
4385
4386 if ((flags & FOLL_WRITE) && !pte_write(pte))
4387 goto unlock;
4388
4389 *prot = pgprot_val(pte_pgprot(pte));
4390 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4391
4392 ret = 0;
4393unlock:
4394 pte_unmap_unlock(ptep, ptl);
4395out:
4396 return ret;
4397}
4398
4399int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4400 void *buf, int len, int write)
4401{
4402 resource_size_t phys_addr;
4403 unsigned long prot = 0;
4404 void __iomem *maddr;
4405 int offset = addr & (PAGE_SIZE-1);
4406
4407 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4408 return -EINVAL;
4409
4410 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4411 if (!maddr)
4412 return -ENOMEM;
4413
4414 if (write)
4415 memcpy_toio(maddr + offset, buf, len);
4416 else
4417 memcpy_fromio(buf, maddr + offset, len);
4418 iounmap(maddr);
4419
4420 return len;
4421}
4422EXPORT_SYMBOL_GPL(generic_access_phys);
4423#endif
4424

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
4429int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4430 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4431{
4432 struct vm_area_struct *vma;
4433 void *old_buf = buf;
4434 int write = gup_flags & FOLL_WRITE;
4435
4436 down_read(&mm->mmap_sem);
4437
4438 while (len) {
4439 int bytes, ret, offset;
4440 void *maddr;
4441 struct page *page = NULL;
4442
4443 ret = get_user_pages_remote(tsk, mm, addr, 1,
4444 gup_flags, &page, &vma, NULL);
4445 if (ret <= 0) {
4446#ifndef CONFIG_HAVE_IOREMAP_PROT
4447 break;
4448#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
4453 vma = find_vma(mm, addr);
4454 if (!vma || vma->vm_start > addr)
4455 break;
4456 if (vma->vm_ops && vma->vm_ops->access)
4457 ret = vma->vm_ops->access(vma, addr, buf,
4458 len, write);
4459 if (ret <= 0)
4460 break;
4461 bytes = ret;
4462#endif
4463 } else {
4464 bytes = len;
4465 offset = addr & (PAGE_SIZE-1);
4466 if (bytes > PAGE_SIZE-offset)
4467 bytes = PAGE_SIZE-offset;
4468
4469 maddr = kmap(page);
4470 if (write) {
4471 copy_to_user_page(vma, page, addr,
4472 maddr + offset, buf, bytes);
4473 set_page_dirty_lock(page);
4474 } else {
4475 copy_from_user_page(vma, page, addr,
4476 buf, maddr + offset, bytes);
4477 }
4478 kunmap(page);
4479 put_page(page);
4480 }
4481 len -= bytes;
4482 buf += bytes;
4483 addr += bytes;
4484 }
4485 up_read(&mm->mmap_sem);
4486
4487 return buf - old_buf;
4488}
4489
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */
4500int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4501 void *buf, int len, unsigned int gup_flags)
4502{
4503 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4504}
4505
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space.
 * Do not walk the page table directly, use get_user_pages.
 */
4511int access_process_vm(struct task_struct *tsk, unsigned long addr,
4512 void *buf, int len, unsigned int gup_flags)
4513{
4514 struct mm_struct *mm;
4515 int ret;
4516
4517 mm = get_task_mm(tsk);
4518 if (!mm)
4519 return 0;
4520
4521 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4522
4523 mmput(mm);
4524
4525 return ret;
4526}
4527EXPORT_SYMBOL_GPL(access_process_vm);
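
/*
 * Note: access_process_vm() is the mechanism underneath ptrace();
 * PTRACE_PEEKDATA and PTRACE_POKEDATA amount to
 * access_process_vm(child, addr, &word, sizeof(word), gup_flags) with
 * FOLL_WRITE set for the poke case, going through get_user_pages
 * instead of walking the child's page tables by hand.
 */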
4528
/*
 * Print the name of a VMA.
 */
4532void print_vma_addr(char *prefix, unsigned long ip)
4533{
4534 struct mm_struct *mm = current->mm;
4535 struct vm_area_struct *vma;
4536
	/*
	 * we might be running from an atomic context so we cannot sleep
	 */
4540 if (!down_read_trylock(&mm->mmap_sem))
4541 return;
4542
4543 vma = find_vma(mm, ip);
4544 if (vma && vma->vm_file) {
4545 struct file *f = vma->vm_file;
4546 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4547 if (buf) {
4548 char *p;
4549
4550 p = file_path(f, buf, PAGE_SIZE);
4551 if (IS_ERR(p))
4552 p = "?";
4553 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4554 vma->vm_start,
4555 vma->vm_end - vma->vm_start);
4556 free_page((unsigned long)buf);
4557 }
4558 }
4559 up_read(&mm->mmap_sem);
4560}
4561
4562#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4563void __might_fault(const char *file, int line)
4564{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate false positives.
	 */
4571 if (uaccess_kernel())
4572 return;
4573 if (pagefault_disabled())
4574 return;
4575 __might_sleep(file, line, 0);
4576#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4577 if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
4579#endif
4580}
4581EXPORT_SYMBOL(__might_fault);
4582#endif
4583
4584#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
4590static inline void process_huge_page(
4591 unsigned long addr_hint, unsigned int pages_per_huge_page,
4592 void (*process_subpage)(unsigned long addr, int idx, void *arg),
4593 void *arg)
4594{
4595 int i, n, base, l;
4596 unsigned long addr = addr_hint &
4597 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4598
	/* Process target subpage last to keep its cache lines hot */
4600 might_sleep();
4601 n = (addr_hint - addr) / PAGE_SIZE;
4602 if (2 * n <= pages_per_huge_page) {
		/* If target subpage in first half of huge page */
4604 base = 0;
4605 l = n;
		/* Process subpages at the end of huge page */
4607 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4608 cond_resched();
4609 process_subpage(addr + i * PAGE_SIZE, i, arg);
4610 }
4611 } else {
		/* If target subpage in second half of huge page */
4613 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4614 l = pages_per_huge_page - n;
		/* Process subpages at the begin of huge page */
4616 for (i = 0; i < base; i++) {
4617 cond_resched();
4618 process_subpage(addr + i * PAGE_SIZE, i, arg);
4619 }
4620 }
4621
	/*
	 * Process remaining subpages in left-right-left-right pattern
	 * towards the target subpage
	 */
4625 for (i = 0; i < l; i++) {
4626 int left_idx = base + i;
4627 int right_idx = base + 2 * l - 1 - i;
4628
4629 cond_resched();
4630 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
4631 cond_resched();
4632 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
4633 }
4634}
4635
4636static void clear_gigantic_page(struct page *page,
4637 unsigned long addr,
4638 unsigned int pages_per_huge_page)
4639{
4640 int i;
4641 struct page *p = page;
4642
4643 might_sleep();
4644 for (i = 0; i < pages_per_huge_page;
4645 i++, p = mem_map_next(p, page, i)) {
4646 cond_resched();
4647 clear_user_highpage(p, addr + i * PAGE_SIZE);
4648 }
4649}
4650
4651static void clear_subpage(unsigned long addr, int idx, void *arg)
4652{
4653 struct page *page = arg;
4654
4655 clear_user_highpage(page + idx, addr);
4656}
4657
4658void clear_huge_page(struct page *page,
4659 unsigned long addr_hint, unsigned int pages_per_huge_page)
4660{
4661 unsigned long addr = addr_hint &
4662 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4663
4664 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4665 clear_gigantic_page(page, addr, pages_per_huge_page);
4666 return;
4667 }
4668
4669 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
4670}
4671
4672static void copy_user_gigantic_page(struct page *dst, struct page *src,
4673 unsigned long addr,
4674 struct vm_area_struct *vma,
4675 unsigned int pages_per_huge_page)
4676{
4677 int i;
4678 struct page *dst_base = dst;
4679 struct page *src_base = src;
4680
4681 for (i = 0; i < pages_per_huge_page; ) {
4682 cond_resched();
4683 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4684
4685 i++;
4686 dst = mem_map_next(dst, dst_base, i);
4687 src = mem_map_next(src, src_base, i);
4688 }
4689}
4690
4691struct copy_subpage_arg {
4692 struct page *dst;
4693 struct page *src;
4694 struct vm_area_struct *vma;
4695};
4696
4697static void copy_subpage(unsigned long addr, int idx, void *arg)
4698{
4699 struct copy_subpage_arg *copy_arg = arg;
4700
4701 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
4702 addr, copy_arg->vma);
4703}
4704
4705void copy_user_huge_page(struct page *dst, struct page *src,
4706 unsigned long addr_hint, struct vm_area_struct *vma,
4707 unsigned int pages_per_huge_page)
4708{
4709 unsigned long addr = addr_hint &
4710 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4711 struct copy_subpage_arg arg = {
4712 .dst = dst,
4713 .src = src,
4714 .vma = vma,
4715 };
4716
4717 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4718 copy_user_gigantic_page(dst, src, addr, vma,
4719 pages_per_huge_page);
4720 return;
4721 }
4722
4723 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
4724}
4725
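/*
 * Copy a huge page's contents in from userspace.  Returns the number
 * of bytes that could NOT be copied (0 on full success), mirroring the
 * copy_from_user() convention.  With allow_pagefault the copy may
 * sleep to service faults; otherwise it runs under kmap_atomic() and
 * bails out on the first fault.
 */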
4726long copy_huge_page_from_user(struct page *dst_page,
4727 const void __user *usr_src,
4728 unsigned int pages_per_huge_page,
4729 bool allow_pagefault)
4730{
4731 void *src = (void *)usr_src;
4732 void *page_kaddr;
4733 unsigned long i, rc = 0;
4734 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4735
4736 for (i = 0; i < pages_per_huge_page; i++) {
4737 if (allow_pagefault)
4738 page_kaddr = kmap(dst_page + i);
4739 else
4740 page_kaddr = kmap_atomic(dst_page + i);
4741 rc = copy_from_user(page_kaddr,
4742 (const void __user *)(src + i * PAGE_SIZE),
4743 PAGE_SIZE);
4744 if (allow_pagefault)
4745 kunmap(dst_page + i);
4746 else
4747 kunmap_atomic(page_kaddr);
4748
4749 ret_val -= (PAGE_SIZE - rc);
4750 if (rc)
4751 break;
4752
4753 cond_resched();
4754 }
4755 return ret_val;
4756}
4757#endif
4758
4759#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4760
4761static struct kmem_cache *page_ptl_cachep;
4762
4763void __init ptlock_cache_init(void)
4764{
4765 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4766 SLAB_PANIC, NULL);
4767}
4768
4769bool ptlock_alloc(struct page *page)
4770{
4771 spinlock_t *ptl;
4772
4773 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4774 if (!ptl)
4775 return false;
4776 page->ptl = ptl;
4777 return true;
4778}
4779
4780void ptlock_free(struct page *page)
4781{
4782 kmem_cache_free(page_ptl_cachep, page->ptl);
4783}
4784#endif
4785