// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES

unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * high_memory marks the end of the kernel's directly mapped memory;
 * a number of key systems (e.g. ioremap() on x86) rely on it as the
 * upper bound on direct map memory.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * Those arches which don't have hw access flag feature need to
	 * implement their own helper. By default, "true" means pagefault
	 * will be hit on old pte.
	 */
	return true;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
	trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
215static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
216 unsigned long addr)
217{
218 pgtable_t token = pmd_pgtable(*pmd);
219 pmd_clear(pmd);
220 pte_free_tlb(tlb, token, addr);
221 mm_dec_nr_ptes(tlb->mm);
222}
223
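/*
 * Free the PTE-level tables in [addr, end), then free the PMD page itself
 * if it lies entirely within the [floor, ceiling) bounds.  The pud, p4d
 * and pgd variants below follow the same pattern one level up.
 */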
224static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
225 unsigned long addr, unsigned long end,
226 unsigned long floor, unsigned long ceiling)
227{
228 pmd_t *pmd;
229 unsigned long next;
230 unsigned long start;
231
232 start = addr;
233 pmd = pmd_offset(pud, addr);
234 do {
235 next = pmd_addr_end(addr, end);
236 if (pmd_none_or_clear_bad(pmd))
237 continue;
238 free_pte_range(tlb, pmd, addr);
239 } while (pmd++, addr = next, addr != end);
240
241 start &= PUD_MASK;
242 if (start < floor)
243 return;
244 if (ceiling) {
245 ceiling &= PUD_MASK;
246 if (!ceiling)
247 return;
248 }
249 if (end - 1 > ceiling - 1)
250 return;
251
252 pmd = pmd_offset(pud, start);
253 pud_clear(pud);
254 pmd_free_tlb(tlb, pmd, start);
255 mm_dec_nr_pmds(tlb->mm);
256}
257
258static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
259 unsigned long addr, unsigned long end,
260 unsigned long floor, unsigned long ceiling)
261{
262 pud_t *pud;
263 unsigned long next;
264 unsigned long start;
265
266 start = addr;
267 pud = pud_offset(p4d, addr);
268 do {
269 next = pud_addr_end(addr, end);
270 if (pud_none_or_clear_bad(pud))
271 continue;
272 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
273 } while (pud++, addr = next, addr != end);
274
275 start &= P4D_MASK;
276 if (start < floor)
277 return;
278 if (ceiling) {
279 ceiling &= P4D_MASK;
280 if (!ceiling)
281 return;
282 }
283 if (end - 1 > ceiling - 1)
284 return;
285
286 pud = pud_offset(p4d, start);
287 p4d_clear(p4d);
288 pud_free_tlb(tlb, pud, start);
289 mm_dec_nr_puds(tlb->mm);
290}
291
292static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
293 unsigned long addr, unsigned long end,
294 unsigned long floor, unsigned long ceiling)
295{
296 p4d_t *p4d;
297 unsigned long next;
298 unsigned long start;
299
300 start = addr;
301 p4d = p4d_offset(pgd, addr);
302 do {
303 next = p4d_addr_end(addr, end);
304 if (p4d_none_or_clear_bad(p4d))
305 continue;
306 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
307 } while (p4d++, addr = next, addr != end);
308
309 start &= PGDIR_MASK;
310 if (start < floor)
311 return;
312 if (ceiling) {
313 ceiling &= PGDIR_MASK;
314 if (!ceiling)
315 return;
316 }
317 if (end - 1 > ceiling - 1)
318 return;
319
320 p4d = p4d_offset(pgd, start);
321 pgd_clear(pgd);
322 p4d_free_tlb(tlb, p4d, start);
323}
324
/*
 * This function frees user-level page tables of a process.
 */
328void free_pgd_range(struct mmu_gather *tlb,
329 unsigned long addr, unsigned long end,
330 unsigned long floor, unsigned long ceiling)
331{
332 pgd_t *pgd;
333 unsigned long next;
334
	/*
	 * addr/end delimit the range whose page tables are torn down, while
	 * floor/ceiling bound the region that may actually be freed:
	 * page-table pages are only released if they lie entirely inside
	 * [floor, ceiling).  The "- 1" comparisons below cope with 0 meaning
	 * both the bottom of the address space (for addr/floor) and the top
	 * of it (for end/ceiling), and addr/end are first trimmed so that a
	 * partial PMD shared with a neighbouring region is left in place.
	 */
361 addr &= PMD_MASK;
362 if (addr < floor) {
363 addr += PMD_SIZE;
364 if (!addr)
365 return;
366 }
367 if (ceiling) {
368 ceiling &= PMD_MASK;
369 if (!ceiling)
370 return;
371 }
372 if (end - 1 > ceiling - 1)
373 end -= PMD_SIZE;
374 if (addr > end - 1)
375 return;

	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
380 tlb_change_page_size(tlb, PAGE_SIZE);
381 pgd = pgd_offset(tlb->mm, addr);
382 do {
383 next = pgd_addr_end(addr, end);
384 if (pgd_none_or_clear_bad(pgd))
385 continue;
386 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
387 } while (pgd++, addr = next, addr != end);
388}
389
390void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
391 unsigned long floor, unsigned long ceiling)
392{
393 while (vma) {
394 struct vm_area_struct *next = vma->vm_next;
395 unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
401 unlink_anon_vmas(vma);
402 unlink_file_vma(vma);
403
404 if (is_vm_hugetlb_page(vma)) {
405 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
406 floor, next ? next->vm_start : ceiling);
407 } else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
411 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
412 && !is_vm_hugetlb_page(next)) {
413 vma = next;
414 next = vma->vm_next;
415 unlink_anon_vmas(vma);
416 unlink_file_vma(vma);
417 }
418 free_pgd_range(tlb, addr, vma->vm_end,
419 floor, next ? next->vm_start : ceiling);
420 }
421 vma = next;
422 }
423}
424
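/*
 * Allocate a new pte page for @pmd and install it under the pmd lock.
 * If another thread populated the pmd first, the freshly allocated page
 * is released again.
 */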
425int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
426{
427 spinlock_t *ptl;
428 pgtable_t new = pte_alloc_one(mm);
429 if (!new)
430 return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_rmb() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

447 ptl = pmd_lock(mm, pmd);
448 if (likely(pmd_none(*pmd))) {
449 mm_inc_nr_ptes(mm);
450 pmd_populate(mm, pmd, new);
451 new = NULL;
452 }
453 spin_unlock(ptl);
454 if (new)
455 pte_free(mm, new);
456 return 0;
457}
458
459int __pte_alloc_kernel(pmd_t *pmd)
460{
461 pte_t *new = pte_alloc_one_kernel(&init_mm);
462 if (!new)
463 return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

467 spin_lock(&init_mm.page_table_lock);
468 if (likely(pmd_none(*pmd))) {
469 pmd_populate_kernel(&init_mm, pmd, new);
470 new = NULL;
471 }
472 spin_unlock(&init_mm.page_table_lock);
473 if (new)
474 pte_free_kernel(&init_mm, new);
475 return 0;
476}
477
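/*
 * Per-call rss accumulation: callers collect counter deltas in a local
 * vector and flush them into the mm with add_mm_rss_vec() once, instead
 * of touching the shared counters for every pte.
 */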
478static inline void init_rss_vec(int *rss)
479{
480 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
481}
482
483static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
484{
485 int i;
486
487 if (current->mm == mm)
488 sync_mm_rss(mm);
489 for (i = 0; i < NR_MM_COUNTERS; i++)
490 if (rss[i])
491 add_mm_counter(mm, i, rss[i]);
492}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
501static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
502 pte_t pte, struct page *page)
503{
504 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
505 p4d_t *p4d = p4d_offset(pgd, addr);
506 pud_t *pud = pud_offset(p4d, addr);
507 pmd_t *pmd = pmd_offset(pud, addr);
508 struct address_space *mapping;
509 pgoff_t index;
510 static unsigned long resume;
511 static unsigned long nr_shown;
512 static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
518 if (nr_shown == 60) {
519 if (time_before(jiffies, resume)) {
520 nr_unshown++;
521 return;
522 }
523 if (nr_unshown) {
524 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
525 nr_unshown);
526 nr_unshown = 0;
527 }
528 nr_shown = 0;
529 }
530 if (nr_shown++ == 0)
531 resume = jiffies + 60 * HZ;
532
533 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
534 index = linear_page_index(vma, addr);
535
536 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
537 current->comm,
538 (long long)pte_val(pte), (long long)pmd_val(*pmd));
539 if (page)
540 dump_page(page, "bad pte");
541 pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
542 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
543 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
544 vma->vm_file,
545 vma->vm_ops ? vma->vm_ops->fault : NULL,
546 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
547 mapping ? mapping->a_ops->readpage : NULL);
548 dump_stack();
549 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
550}

/*
 * vm_normal_page -- This function gets the "struct page" associated
 * with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here. "Normal" mappings do have a struct
 * page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a
 * pte_special() pte bit, in which case this function is trivial. Secondly,
 * an architecture may not have a spare pte bit, which requires a more
 * complicated scheme, described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered
 * a special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.  The way we recognize COWed
 * pages within VM_PFNMAP mappings is through the rules set up by
 * remap_pfn_range(): the vma will have the VM_PFNMAP bit set, and vm_pgoff
 * points to the first PFN mapped, so every special mapping honors the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * and for normal mappings this is false.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing; however, _all_ pages with a struct page (that is, those
 * where pfn_valid is true) are refcounted and considered normal pages by
 * the VM.  This lifts the linearity restriction of VM_PFNMAP at the cost of
 * refcounting the pages, which allows such mappings to be COWed.
 */
594struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
595 pte_t pte)
596{
597 unsigned long pfn = pte_pfn(pte);
598
599 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
600 if (likely(!pte_special(pte)))
601 goto check_pfn;
602 if (vma->vm_ops && vma->vm_ops->find_special_page)
603 return vma->vm_ops->find_special_page(vma, addr);
604 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
605 return NULL;
606 if (is_zero_pfn(pfn))
607 return NULL;
608 if (pte_devmap(pte))
609 return NULL;
610
611 print_bad_pte(vma, addr, pte, NULL);
612 return NULL;
613 }

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

617 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
618 if (vma->vm_flags & VM_MIXEDMAP) {
619 if (!pfn_valid(pfn))
620 return NULL;
621 goto out;
622 } else {
623 unsigned long off;
624 off = (addr - vma->vm_start) >> PAGE_SHIFT;
625 if (pfn == vma->vm_pgoff + off)
626 return NULL;
627 if (!is_cow_mapping(vma->vm_flags))
628 return NULL;
629 }
630 }
631
632 if (is_zero_pfn(pfn))
633 return NULL;
634
635check_pfn:
636 if (unlikely(pfn > highest_memmap_pfn)) {
637 print_bad_pte(vma, addr, pte, NULL);
638 return NULL;
639 }

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VM_IO pages installed with remap_pfn_range() lookups.
	 */
645out:
646 return pfn_to_page(pfn);
647}
648
649#ifdef CONFIG_TRANSPARENT_HUGEPAGE
650struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
651 pmd_t pmd)
652{
653 unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
660 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
661 if (vma->vm_flags & VM_MIXEDMAP) {
662 if (!pfn_valid(pfn))
663 return NULL;
664 goto out;
665 } else {
666 unsigned long off;
667 off = (addr - vma->vm_start) >> PAGE_SHIFT;
668 if (pfn == vma->vm_pgoff + off)
669 return NULL;
670 if (!is_cow_mapping(vma->vm_flags))
671 return NULL;
672 }
673 }
674
675 if (pmd_devmap(pmd))
676 return NULL;
677 if (is_huge_zero_pmd(pmd))
678 return NULL;
679 if (unlikely(pfn > highest_memmap_pfn))
680 return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VM_IO pages installed with remap_pfn_range() lookups.
	 */
686out:
687 return pfn_to_page(pfn);
688}
689#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
697static unsigned long
698copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
699 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
700 unsigned long addr, int *rss)
701{
702 unsigned long vm_flags = vma->vm_flags;
703 pte_t pte = *src_pte;
704 struct page *page;
705 swp_entry_t entry = pte_to_swp_entry(pte);
706
707 if (likely(!non_swap_entry(entry))) {
708 if (swap_duplicate(entry) < 0)
709 return entry.val;

		/* make sure dst_mm is on swapoff's mmlist. */
712 if (unlikely(list_empty(&dst_mm->mmlist))) {
713 spin_lock(&mmlist_lock);
714 if (list_empty(&dst_mm->mmlist))
715 list_add(&dst_mm->mmlist,
716 &src_mm->mmlist);
717 spin_unlock(&mmlist_lock);
718 }
719 rss[MM_SWAPENTS]++;
720 } else if (is_migration_entry(entry)) {
721 page = migration_entry_to_page(entry);
722
723 rss[mm_counter(page)]++;
724
725 if (is_write_migration_entry(entry) &&
726 is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both
			 * parent and child to be set to read.
			 */
731 make_migration_entry_read(&entry);
732 pte = swp_entry_to_pte(entry);
733 if (pte_swp_soft_dirty(*src_pte))
734 pte = pte_swp_mksoft_dirty(pte);
735 if (pte_swp_uffd_wp(*src_pte))
736 pte = pte_swp_mkuffd_wp(pte);
737 set_pte_at(src_mm, addr, src_pte, pte);
738 }
739 } else if (is_device_private_entry(entry)) {
740 page = device_private_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in
		 * this respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
751 get_page(page);
752 rss[mm_counter(page)]++;
753 page_dup_rmap(page, false);

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
762 if (is_write_device_private_entry(entry) &&
763 is_cow_mapping(vm_flags)) {
764 make_device_private_entry_read(&entry);
765 pte = swp_entry_to_pte(entry);
766 if (pte_swp_uffd_wp(*src_pte))
767 pte = pte_swp_mkuffd_wp(pte);
768 set_pte_at(src_mm, addr, src_pte, pte);
769 }
770 }
771 set_pte_at(dst_mm, addr, dst_pte, pte);
772 return 0;
773}

/*
 * Copy a present and normal page if necessary.
 *
 * NOTE! The usual case is that this doesn't need to do
 * anything, and can just return a positive value. That
 * will let the caller know that it can just increase
 * the page refcount and re-use the pte the traditional
 * way.
 *
 * But _if_ we need to copy it because it needs to be
 * pinned in the parent (and the child should get its own
 * copy rather than just a reference to the same page),
 * we'll do that here and return zero to let the caller
 * know we're done.
 *
 * And if we need a pre-allocated page but don't yet have
 * it, return -EAGAIN. This way the caller can exit quickly.
 */
795static inline int
796copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
797 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
798 struct page **prealloc, pte_t pte, struct page *page)
799{
800 struct mm_struct *src_mm = src_vma->vm_mm;
801 struct page *new_page;
802
803 if (!is_cow_mapping(src_vma->vm_flags))
804 return 1;

	/*
	 * What we want to do is to check whether this page may
	 * have been pinned by the parent process.  If so,
	 * instead of wrprotect the pte on both sides, we copy
	 * the page immediately so that we'll always guarantee
	 * the pinned page won't be randomly replaced in the
	 * future.
	 *
	 * The page pinning checks are just "has this mm ever
	 * seen pinning", along with the (inexact) check of
	 * the page count. That might give false positives for
	 * pinning, but it will work correctly.
	 */
819 if (likely(!atomic_read(&src_mm->has_pinned)))
820 return 1;
821 if (likely(!page_maybe_dma_pinned(page)))
822 return 1;
823
824 new_page = *prealloc;
825 if (!new_page)
826 return -EAGAIN;

	/*
	 * We have a prealloc page, all good!  Take it
	 * over and copy the page & arm it.
	 */
832 *prealloc = NULL;
833 copy_user_highpage(new_page, page, addr, src_vma);
834 __SetPageUptodate(new_page);
835 page_add_new_anon_rmap(new_page, dst_vma, addr, false);
836 lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
837 rss[mm_counter(new_page)]++;

	/* All done, just insert the new page copy in the child */
840 pte = mk_pte(new_page, dst_vma->vm_page_prot);
841 pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
842 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
843 return 0;
844}

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one
 * preallocated page is required to copy this pte.
 */
850static inline int
851copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
852 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
853 struct page **prealloc)
854{
855 struct mm_struct *src_mm = src_vma->vm_mm;
856 unsigned long vm_flags = src_vma->vm_flags;
857 pte_t pte = *src_pte;
858 struct page *page;
859
860 page = vm_normal_page(src_vma, addr, pte);
861 if (page) {
862 int retval;
863
864 retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
865 addr, rss, prealloc, pte, page);
866 if (retval <= 0)
867 return retval;
868
869 get_page(page);
870 page_dup_rmap(page, false);
871 rss[mm_counter(page)]++;
872 }

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
878 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
879 ptep_set_wrprotect(src_mm, addr, src_pte);
880 pte = pte_wrprotect(pte);
881 }

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
887 if (vm_flags & VM_SHARED)
888 pte = pte_mkclean(pte);
889 pte = pte_mkold(pte);

	/*
	 * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
	 * does not have the VM_UFFD_WP, which means that the uffd
	 * fork event is not enabled.
	 */
896 if (!(vm_flags & VM_UFFD_WP))
897 pte = pte_clear_uffd_wp(pte);
898
899 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
900 return 0;
901}
902
903static inline struct page *
904page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
905 unsigned long addr)
906{
907 struct page *new_page;
908
909 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
910 if (!new_page)
911 return NULL;
912
913 if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
914 put_page(new_page);
915 return NULL;
916 }
917 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
918
919 return new_page;
920}
921
922static int
923copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
924 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
925 unsigned long end)
926{
927 struct mm_struct *dst_mm = dst_vma->vm_mm;
928 struct mm_struct *src_mm = src_vma->vm_mm;
929 pte_t *orig_src_pte, *orig_dst_pte;
930 pte_t *src_pte, *dst_pte;
931 spinlock_t *src_ptl, *dst_ptl;
932 int progress, ret = 0;
933 int rss[NR_MM_COUNTERS];
934 swp_entry_t entry = (swp_entry_t){0};
935 struct page *prealloc = NULL;
936
937again:
938 progress = 0;
939 init_rss_vec(rss);
940
941 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
942 if (!dst_pte) {
943 ret = -ENOMEM;
944 goto out;
945 }
946 src_pte = pte_offset_map(src_pmd, addr);
947 src_ptl = pte_lockptr(src_mm, src_pmd);
948 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
949 orig_src_pte = src_pte;
950 orig_dst_pte = dst_pte;
951 arch_enter_lazy_mmu_mode();
952
953 do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
958 if (progress >= 32) {
959 progress = 0;
960 if (need_resched() ||
961 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
962 break;
963 }
964 if (pte_none(*src_pte)) {
965 progress++;
966 continue;
967 }
968 if (unlikely(!pte_present(*src_pte))) {
969 entry.val = copy_nonpresent_pte(dst_mm, src_mm,
970 dst_pte, src_pte,
971 src_vma, addr, rss);
972 if (entry.val)
973 break;
974 progress += 8;
975 continue;
976 }
977
978 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
979 addr, rss, &prealloc);
980
981
982
983
984 if (unlikely(ret == -EAGAIN))
985 break;
986 if (unlikely(prealloc)) {
987
988
989
990
991
992
993 put_page(prealloc);
994 prealloc = NULL;
995 }
996 progress += 8;
997 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
998
999 arch_leave_lazy_mmu_mode();
1000 spin_unlock(src_ptl);
1001 pte_unmap(orig_src_pte);
1002 add_mm_rss_vec(dst_mm, rss);
1003 pte_unmap_unlock(orig_dst_pte, dst_ptl);
1004 cond_resched();
1005
1006 if (entry.val) {
1007 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
1008 ret = -ENOMEM;
1009 goto out;
1010 }
1011 entry.val = 0;
1012 } else if (ret) {
1013 WARN_ON_ONCE(ret != -EAGAIN);
1014 prealloc = page_copy_prealloc(src_mm, src_vma, addr);
1015 if (!prealloc)
1016 return -ENOMEM;
1017
1018 ret = 0;
1019 }
1020 if (addr != end)
1021 goto again;
1022out:
1023 if (unlikely(prealloc))
1024 put_page(prealloc);
1025 return ret;
1026}
1027
1028static inline int
1029copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1030 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1031 unsigned long end)
1032{
1033 struct mm_struct *dst_mm = dst_vma->vm_mm;
1034 struct mm_struct *src_mm = src_vma->vm_mm;
1035 pmd_t *src_pmd, *dst_pmd;
1036 unsigned long next;
1037
1038 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1039 if (!dst_pmd)
1040 return -ENOMEM;
1041 src_pmd = pmd_offset(src_pud, addr);
1042 do {
1043 next = pmd_addr_end(addr, end);
1044 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1045 || pmd_devmap(*src_pmd)) {
1046 int err;
1047 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1048 err = copy_huge_pmd(dst_mm, src_mm,
1049 dst_pmd, src_pmd, addr, src_vma);
1050 if (err == -ENOMEM)
1051 return -ENOMEM;
1052 if (!err)
1053 continue;
1054
1055 }
1056 if (pmd_none_or_clear_bad(src_pmd))
1057 continue;
1058 if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1059 addr, next))
1060 return -ENOMEM;
1061 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1062 return 0;
1063}
1064
1065static inline int
1066copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1067 p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1068 unsigned long end)
1069{
1070 struct mm_struct *dst_mm = dst_vma->vm_mm;
1071 struct mm_struct *src_mm = src_vma->vm_mm;
1072 pud_t *src_pud, *dst_pud;
1073 unsigned long next;
1074
1075 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1076 if (!dst_pud)
1077 return -ENOMEM;
1078 src_pud = pud_offset(src_p4d, addr);
1079 do {
1080 next = pud_addr_end(addr, end);
1081 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1082 int err;
1083
1084 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1085 err = copy_huge_pud(dst_mm, src_mm,
1086 dst_pud, src_pud, addr, src_vma);
1087 if (err == -ENOMEM)
1088 return -ENOMEM;
1089 if (!err)
1090 continue;
1091
1092 }
1093 if (pud_none_or_clear_bad(src_pud))
1094 continue;
1095 if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1096 addr, next))
1097 return -ENOMEM;
1098 } while (dst_pud++, src_pud++, addr = next, addr != end);
1099 return 0;
1100}
1101
1102static inline int
1103copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1104 pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1105 unsigned long end)
1106{
1107 struct mm_struct *dst_mm = dst_vma->vm_mm;
1108 p4d_t *src_p4d, *dst_p4d;
1109 unsigned long next;
1110
1111 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1112 if (!dst_p4d)
1113 return -ENOMEM;
1114 src_p4d = p4d_offset(src_pgd, addr);
1115 do {
1116 next = p4d_addr_end(addr, end);
1117 if (p4d_none_or_clear_bad(src_p4d))
1118 continue;
1119 if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1120 addr, next))
1121 return -ENOMEM;
1122 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1123 return 0;
1124}
1125
1126int
1127copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1128{
1129 pgd_t *src_pgd, *dst_pgd;
1130 unsigned long next;
1131 unsigned long addr = src_vma->vm_start;
1132 unsigned long end = src_vma->vm_end;
1133 struct mm_struct *dst_mm = dst_vma->vm_mm;
1134 struct mm_struct *src_mm = src_vma->vm_mm;
1135 struct mmu_notifier_range range;
1136 bool is_cow;
1137 int ret;
1138
	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
1145 if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1146 !src_vma->anon_vma)
1147 return 0;
1148
1149 if (is_vm_hugetlb_page(src_vma))
1150 return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
1151
1152 if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
1157 ret = track_pfn_copy(src_vma);
1158 if (ret)
1159 return ret;
1160 }
1161
	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
1168 is_cow = is_cow_mapping(src_vma->vm_flags);
1169
1170 if (is_cow) {
1171 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1172 0, src_vma, src_mm, addr, end);
1173 mmu_notifier_invalidate_range_start(&range);
1174 }
1175
1176 ret = 0;
1177 dst_pgd = pgd_offset(dst_mm, addr);
1178 src_pgd = pgd_offset(src_mm, addr);
1179 do {
1180 next = pgd_addr_end(addr, end);
1181 if (pgd_none_or_clear_bad(src_pgd))
1182 continue;
1183 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1184 addr, next))) {
1185 ret = -ENOMEM;
1186 break;
1187 }
1188 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1189
1190 if (is_cow)
1191 mmu_notifier_invalidate_range_end(&range);
1192 return ret;
1193}
1194
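/*
 * Tear down the ptes in [addr, end): present pages are accounted, their
 * rmap dropped and the pages queued on the mmu_gather for freeing; swap
 * and migration entries are released.  A TLB flush is forced early when a
 * dirty shared page is unmapped or the gather batch fills up.
 */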
1195static unsigned long zap_pte_range(struct mmu_gather *tlb,
1196 struct vm_area_struct *vma, pmd_t *pmd,
1197 unsigned long addr, unsigned long end,
1198 struct zap_details *details)
1199{
1200 struct mm_struct *mm = tlb->mm;
1201 int force_flush = 0;
1202 int rss[NR_MM_COUNTERS];
1203 spinlock_t *ptl;
1204 pte_t *start_pte;
1205 pte_t *pte;
1206 swp_entry_t entry;
1207
1208 tlb_change_page_size(tlb, PAGE_SIZE);
1209again:
1210 init_rss_vec(rss);
1211 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1212 pte = start_pte;
1213 flush_tlb_batched_pending(mm);
1214 arch_enter_lazy_mmu_mode();
1215 do {
1216 pte_t ptent = *pte;
1217 if (pte_none(ptent))
1218 continue;
1219
1220 if (need_resched())
1221 break;
1222
1223 if (pte_present(ptent)) {
1224 struct page *page;
1225
1226 page = vm_normal_page(vma, addr, ptent);
1227 if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
1233 if (details->check_mapping &&
1234 details->check_mapping != page_rmapping(page))
1235 continue;
1236 }
1237 ptent = ptep_get_and_clear_full(mm, addr, pte,
1238 tlb->fullmm);
1239 tlb_remove_tlb_entry(tlb, pte, addr);
1240 if (unlikely(!page))
1241 continue;
1242
1243 if (!PageAnon(page)) {
1244 if (pte_dirty(ptent)) {
1245 force_flush = 1;
1246 set_page_dirty(page);
1247 }
1248 if (pte_young(ptent) &&
1249 likely(!(vma->vm_flags & VM_SEQ_READ)))
1250 mark_page_accessed(page);
1251 }
1252 rss[mm_counter(page)]--;
1253 page_remove_rmap(page, false);
1254 if (unlikely(page_mapcount(page) < 0))
1255 print_bad_pte(vma, addr, ptent, page);
1256 if (unlikely(__tlb_remove_page(tlb, page))) {
1257 force_flush = 1;
1258 addr += PAGE_SIZE;
1259 break;
1260 }
1261 continue;
1262 }
1263
1264 entry = pte_to_swp_entry(ptent);
1265 if (is_device_private_entry(entry)) {
1266 struct page *page = device_private_entry_to_page(entry);
1267
1268 if (unlikely(details && details->check_mapping)) {
1269
1270
1271
1272
1273
1274 if (details->check_mapping !=
1275 page_rmapping(page))
1276 continue;
1277 }
1278
1279 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1280 rss[mm_counter(page)]--;
1281 page_remove_rmap(page, false);
1282 put_page(page);
1283 continue;
1284 }
1285
1286
1287 if (unlikely(details))
1288 continue;
1289
1290 if (!non_swap_entry(entry))
1291 rss[MM_SWAPENTS]--;
1292 else if (is_migration_entry(entry)) {
1293 struct page *page;
1294
1295 page = migration_entry_to_page(entry);
1296 rss[mm_counter(page)]--;
1297 }
1298 if (unlikely(!free_swap_and_cache(entry)))
1299 print_bad_pte(vma, addr, ptent, NULL);
1300 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1301 } while (pte++, addr += PAGE_SIZE, addr != end);
1302
1303 add_mm_rss_vec(mm, rss);
1304 arch_leave_lazy_mmu_mode();
1305
1306
1307 if (force_flush)
1308 tlb_flush_mmu_tlbonly(tlb);
1309 pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
1317 if (force_flush) {
1318 force_flush = 0;
1319 tlb_flush_mmu(tlb);
1320 }
1321
1322 if (addr != end) {
1323 cond_resched();
1324 goto again;
1325 }
1326
1327 return addr;
1328}
1329
1330static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1331 struct vm_area_struct *vma, pud_t *pud,
1332 unsigned long addr, unsigned long end,
1333 struct zap_details *details)
1334{
1335 pmd_t *pmd;
1336 unsigned long next;
1337
1338 pmd = pmd_offset(pud, addr);
1339 do {
1340 next = pmd_addr_end(addr, end);
1341 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1342 if (next - addr != HPAGE_PMD_SIZE)
1343 __split_huge_pmd(vma, pmd, addr, false, NULL);
1344 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1345 goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us.  This is
		 * because MADV_DONTNEED holds the mmap_lock in read
		 * mode.
		 */
1355 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1356 goto next;
1357 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1358next:
1359 cond_resched();
1360 } while (pmd++, addr = next, addr != end);
1361
1362 return addr;
1363}
1364
1365static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1366 struct vm_area_struct *vma, p4d_t *p4d,
1367 unsigned long addr, unsigned long end,
1368 struct zap_details *details)
1369{
1370 pud_t *pud;
1371 unsigned long next;
1372
1373 pud = pud_offset(p4d, addr);
1374 do {
1375 next = pud_addr_end(addr, end);
1376 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1377 if (next - addr != HPAGE_PUD_SIZE) {
1378 mmap_assert_locked(tlb->mm);
1379 split_huge_pud(vma, pud, addr);
1380 } else if (zap_huge_pud(tlb, vma, pud, addr))
1381 goto next;
1382
1383 }
1384 if (pud_none_or_clear_bad(pud))
1385 continue;
1386 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1387next:
1388 cond_resched();
1389 } while (pud++, addr = next, addr != end);
1390
1391 return addr;
1392}
1393
1394static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1395 struct vm_area_struct *vma, pgd_t *pgd,
1396 unsigned long addr, unsigned long end,
1397 struct zap_details *details)
1398{
1399 p4d_t *p4d;
1400 unsigned long next;
1401
1402 p4d = p4d_offset(pgd, addr);
1403 do {
1404 next = p4d_addr_end(addr, end);
1405 if (p4d_none_or_clear_bad(p4d))
1406 continue;
1407 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1408 } while (p4d++, addr = next, addr != end);
1409
1410 return addr;
1411}
1412
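/*
 * Walk the page tables for one vma and zap every level in [addr, end),
 * delegating to the zap_p4d/pud/pmd/pte_range helpers above.
 */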
1413void unmap_page_range(struct mmu_gather *tlb,
1414 struct vm_area_struct *vma,
1415 unsigned long addr, unsigned long end,
1416 struct zap_details *details)
1417{
1418 pgd_t *pgd;
1419 unsigned long next;
1420
1421 BUG_ON(addr >= end);
1422 tlb_start_vma(tlb, vma);
1423 pgd = pgd_offset(vma->vm_mm, addr);
1424 do {
1425 next = pgd_addr_end(addr, end);
1426 if (pgd_none_or_clear_bad(pgd))
1427 continue;
1428 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1429 } while (pgd++, addr = next, addr != end);
1430 tlb_end_vma(tlb, vma);
1431}
1432
1433
1434static void unmap_single_vma(struct mmu_gather *tlb,
1435 struct vm_area_struct *vma, unsigned long start_addr,
1436 unsigned long end_addr,
1437 struct zap_details *details)
1438{
1439 unsigned long start = max(vma->vm_start, start_addr);
1440 unsigned long end;
1441
1442 if (start >= vma->vm_end)
1443 return;
1444 end = min(vma->vm_end, end_addr);
1445 if (end <= vma->vm_start)
1446 return;
1447
1448 if (vma->vm_file)
1449 uprobe_munmap(vma, start, end);
1450
1451 if (unlikely(vma->vm_flags & VM_PFNMAP))
1452 untrack_pfn(vma, 0, 0);
1453
1454 if (start != end) {
1455 if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
1467 if (vma->vm_file) {
1468 i_mmap_lock_write(vma->vm_file->f_mapping);
1469 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1470 i_mmap_unlock_write(vma->vm_file->f_mapping);
1471 }
1472 } else
1473 unmap_page_range(tlb, vma, start, end, details);
1474 }
1475}
1476
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
1495void unmap_vmas(struct mmu_gather *tlb,
1496 struct vm_area_struct *vma, unsigned long start_addr,
1497 unsigned long end_addr)
1498{
1499 struct mmu_notifier_range range;
1500
1501 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1502 start_addr, end_addr);
1503 mmu_notifier_invalidate_range_start(&range);
1504 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1505 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1506 mmu_notifier_invalidate_range_end(&range);
1507}
1508
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
1517void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1518 unsigned long size)
1519{
1520 struct mmu_notifier_range range;
1521 struct mmu_gather tlb;
1522
1523 lru_add_drain();
1524 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1525 start, start + size);
1526 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1527 update_hiwater_rss(vma->vm_mm);
1528 mmu_notifier_invalidate_range_start(&range);
1529 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1530 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1531 mmu_notifier_invalidate_range_end(&range);
1532 tlb_finish_mmu(&tlb, start, range.end);
1533}
1534
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
1544static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1545 unsigned long size, struct zap_details *details)
1546{
1547 struct mmu_notifier_range range;
1548 struct mmu_gather tlb;
1549
1550 lru_add_drain();
1551 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1552 address, address + size);
1553 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1554 update_hiwater_rss(vma->vm_mm);
1555 mmu_notifier_invalidate_range_start(&range);
1556 unmap_single_vma(&tlb, vma, address, range.end, details);
1557 mmu_notifier_invalidate_range_end(&range);
1558 tlb_finish_mmu(&tlb, address, range.end);
1559}
1560
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */
1572void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1573 unsigned long size)
1574{
1575 if (address < vma->vm_start || address + size > vma->vm_end ||
1576 !(vma->vm_flags & VM_PFNMAP))
1577 return;
1578
1579 zap_page_range_single(vma, address, size, NULL);
1580}
1581EXPORT_SYMBOL_GPL(zap_vma_ptes);
1582
1583static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
1584{
1585 pgd_t *pgd;
1586 p4d_t *p4d;
1587 pud_t *pud;
1588 pmd_t *pmd;
1589
1590 pgd = pgd_offset(mm, addr);
1591 p4d = p4d_alloc(mm, pgd, addr);
1592 if (!p4d)
1593 return NULL;
1594 pud = pud_alloc(mm, p4d, addr);
1595 if (!pud)
1596 return NULL;
1597 pmd = pmd_alloc(mm, pud, addr);
1598 if (!pmd)
1599 return NULL;
1600
1601 VM_BUG_ON(pmd_trans_huge(*pmd));
1602 return pmd;
1603}
1604
1605pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1606 spinlock_t **ptl)
1607{
1608 pmd_t *pmd = walk_to_pmd(mm, addr);
1609
1610 if (!pmd)
1611 return NULL;
1612 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1613}
1614
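/*
 * Only "ordinary" pages may be inserted into a user mapping by drivers:
 * anonymous, slab and type-tagged pages are rejected here before
 * insert_page_into_pte_locked() takes a reference and maps them.
 */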
1615static int validate_page_before_insert(struct page *page)
1616{
1617 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1618 return -EINVAL;
1619 flush_dcache_page(page);
1620 return 0;
1621}
1622
1623static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
1624 unsigned long addr, struct page *page, pgprot_t prot)
1625{
1626 if (!pte_none(*pte))
1627 return -EBUSY;
1628
1629 get_page(page);
1630 inc_mm_counter_fast(mm, mm_counter_file(page));
1631 page_add_file_rmap(page, false);
1632 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1633 return 0;
1634}
1635
/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
1643static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1644 struct page *page, pgprot_t prot)
1645{
1646 struct mm_struct *mm = vma->vm_mm;
1647 int retval;
1648 pte_t *pte;
1649 spinlock_t *ptl;
1650
1651 retval = validate_page_before_insert(page);
1652 if (retval)
1653 goto out;
1654 retval = -ENOMEM;
1655 pte = get_locked_pte(mm, addr, &ptl);
1656 if (!pte)
1657 goto out;
1658 retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
1659 pte_unmap_unlock(pte, ptl);
1660out:
1661 return retval;
1662}
1663
1664#ifdef pte_index
1665static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
1666 unsigned long addr, struct page *page, pgprot_t prot)
1667{
1668 int err;
1669
1670 if (!page_count(page))
1671 return -EINVAL;
1672 err = validate_page_before_insert(page);
1673 if (err)
1674 return err;
1675 return insert_page_into_pte_locked(mm, pte, addr, page, prot);
1676}
1677
/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 */
1681static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
1682 struct page **pages, unsigned long *num, pgprot_t prot)
1683{
1684 pmd_t *pmd = NULL;
1685 pte_t *start_pte, *pte;
1686 spinlock_t *pte_lock;
1687 struct mm_struct *const mm = vma->vm_mm;
1688 unsigned long curr_page_idx = 0;
1689 unsigned long remaining_pages_total = *num;
1690 unsigned long pages_to_write_in_pmd;
1691 int ret;
1692more:
1693 ret = -EFAULT;
1694 pmd = walk_to_pmd(mm, addr);
1695 if (!pmd)
1696 goto out;
1697
1698 pages_to_write_in_pmd = min_t(unsigned long,
1699 remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
1702 ret = -ENOMEM;
1703 if (pte_alloc(mm, pmd))
1704 goto out;
1705
1706 while (pages_to_write_in_pmd) {
1707 int pte_idx = 0;
1708 const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
1709
1710 start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
1711 for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
1712 int err = insert_page_in_batch_locked(mm, pte,
1713 addr, pages[curr_page_idx], prot);
1714 if (unlikely(err)) {
1715 pte_unmap_unlock(start_pte, pte_lock);
1716 ret = err;
1717 remaining_pages_total -= pte_idx;
1718 goto out;
1719 }
1720 addr += PAGE_SIZE;
1721 ++curr_page_idx;
1722 }
1723 pte_unmap_unlock(start_pte, pte_lock);
1724 pages_to_write_in_pmd -= batch_size;
1725 remaining_pages_total -= batch_size;
1726 }
1727 if (remaining_pages_total)
1728 goto more;
1729 ret = 0;
1730out:
1731 *num = remaining_pages_total;
1732 return ret;
1733}
1734#endif
1735
/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
1751int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
1752 struct page **pages, unsigned long *num)
1753{
1754#ifdef pte_index
1755 const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
1756
1757 if (addr < vma->vm_start || end_addr >= vma->vm_end)
1758 return -EFAULT;
1759 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1760 BUG_ON(mmap_read_trylock(vma->vm_mm));
1761 BUG_ON(vma->vm_flags & VM_PFNMAP);
1762 vma->vm_flags |= VM_MIXEDMAP;
1763 }
1764
1765 return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
1766#else
1767 unsigned long idx = 0, pgcount = *num;
1768 int err = -EINVAL;
1769
1770 for (; idx < pgcount; ++idx) {
1771 err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
1772 if (err)
1773 break;
1774 }
1775 *num = pgcount - idx;
1776 return err;
1777#endif
1778}
1779EXPORT_SYMBOL(vm_insert_pages);
1780
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
1810int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1811 struct page *page)
1812{
1813 if (addr < vma->vm_start || addr >= vma->vm_end)
1814 return -EFAULT;
1815 if (!page_count(page))
1816 return -EINVAL;
1817 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1818 BUG_ON(mmap_read_trylock(vma->vm_mm));
1819 BUG_ON(vma->vm_flags & VM_PFNMAP);
1820 vma->vm_flags |= VM_MIXEDMAP;
1821 }
1822 return insert_page(vma, addr, page, vma->vm_page_prot);
1823}
1824EXPORT_SYMBOL(vm_insert_page);
1825
/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
1837static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1838 unsigned long num, unsigned long offset)
1839{
1840 unsigned long count = vma_pages(vma);
1841 unsigned long uaddr = vma->vm_start;
1842 int ret, i;
1843
1844
1845 if (offset >= num)
1846 return -ENXIO;
1847
1848
1849 if (count > num - offset)
1850 return -ENXIO;
1851
1852 for (i = 0; i < count; i++) {
1853 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1854 if (ret < 0)
1855 return ret;
1856 uaddr += PAGE_SIZE;
1857 }
1858
1859 return 0;
1860}
1861
/**
 * vm_map_pages - maps range of kernel pages starts with non zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
1880int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1881 unsigned long num)
1882{
1883 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
1884}
1885EXPORT_SYMBOL(vm_map_pages);
1886
/**
 * vm_map_pages_zero - map range of kernel pages starts with zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
1900int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
1901 unsigned long num)
1902{
1903 return __vm_map_pages(vma, pages, num, 0);
1904}
1905EXPORT_SYMBOL(vm_map_pages_zero);
1906
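/*
 * Install a special (or devmap) pte for @pfn at @addr.  If a pte is
 * already present, the mapping is left alone except in the mkwrite case,
 * where an existing pte for the same pfn is upgraded to writable.
 */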
1907static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1908 pfn_t pfn, pgprot_t prot, bool mkwrite)
1909{
1910 struct mm_struct *mm = vma->vm_mm;
1911 pte_t *pte, entry;
1912 spinlock_t *ptl;
1913
1914 pte = get_locked_pte(mm, addr, &ptl);
1915 if (!pte)
1916 return VM_FAULT_OOM;
1917 if (!pte_none(*pte)) {
1918 if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
1929 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
1930 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
1931 goto out_unlock;
1932 }
1933 entry = pte_mkyoung(*pte);
1934 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1935 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
1936 update_mmu_cache(vma, addr, pte);
1937 }
1938 goto out_unlock;
1939 }
1940
1941
1942 if (pfn_t_devmap(pfn))
1943 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1944 else
1945 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1946
1947 if (mkwrite) {
1948 entry = pte_mkyoung(entry);
1949 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1950 }
1951
1952 set_pte_at(mm, addr, pte, entry);
1953 update_mmu_cache(vma, addr, pte);
1954
1955out_unlock:
1956 pte_unmap_unlock(pte, ptl);
1957 return VM_FAULT_NOPAGE;
1958}
1959
/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * See vmf_insert_mixed_prot() for a discussion of the implication of using
 * a value of @pgprot different from that of @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
1981vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1982 unsigned long pfn, pgprot_t pgprot)
1983{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
1990 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1991 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1992 (VM_PFNMAP|VM_MIXEDMAP));
1993 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1994 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1995
1996 if (addr < vma->vm_start || addr >= vma->vm_end)
1997 return VM_FAULT_SIGBUS;
1998
1999 if (!pfn_modify_allowed(pfn, pgprot))
2000 return VM_FAULT_SIGBUS;
2001
2002 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
2003
2004 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2005 false);
2006}
2007EXPORT_SYMBOL(vmf_insert_pfn_prot);
2008
/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
2029vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2030 unsigned long pfn)
2031{
2032 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2033}
2034EXPORT_SYMBOL(vmf_insert_pfn);
2035
2036static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
2037{
	/* these checks mirror the abort conditions in vm_normal_page */
2039 if (vma->vm_flags & VM_MIXEDMAP)
2040 return true;
2041 if (pfn_t_devmap(pfn))
2042 return true;
2043 if (pfn_t_special(pfn))
2044 return true;
2045 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
2046 return true;
2047 return false;
2048}
2049
2050static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2051 unsigned long addr, pfn_t pfn, pgprot_t pgprot,
2052 bool mkwrite)
2053{
2054 int err;
2055
2056 BUG_ON(!vm_mixed_ok(vma, pfn));
2057
2058 if (addr < vma->vm_start || addr >= vma->vm_end)
2059 return VM_FAULT_SIGBUS;
2060
2061 track_pfn_insert(vma, &pgprot, pfn);
2062
2063 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2064 return VM_FAULT_SIGBUS;
2065

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).  If a page is a special zero page, the refcount
	 * is left alone but it is still a valid page.
	 */
2073 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
2074 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
2075 struct page *page;
2076
2077
2078
2079
2080
2081
2082 page = pfn_to_page(pfn_t_to_pfn(pfn));
2083 err = insert_page(vma, addr, page, pgprot);
2084 } else {
2085 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2086 }
2087
2088 if (err == -ENOMEM)
2089 return VM_FAULT_OOM;
2090 if (err < 0 && err != -EBUSY)
2091 return VM_FAULT_SIGBUS;
2092
2093 return VM_FAULT_NOPAGE;
2094}
2095
/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_mixed(), except that it allows drivers
 * to override pgprot on a per-page basis.  This is typically used to set
 * caching- or encryption bits that are not known at mmap() time and thus
 * cannot be derived from @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
2122vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
2123 pfn_t pfn, pgprot_t pgprot)
2124{
2125 return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
2126}
2127EXPORT_SYMBOL(vmf_insert_mixed_prot);
2128
2129vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2130 pfn_t pfn)
2131{
2132 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2133}
2134EXPORT_SYMBOL(vmf_insert_mixed);
2135
/*
 *  If the insertion of PTE failed because someone else already added a
 *  different entry in the mean time, we treat that as success as we assume
 *  the same entry was actually inserted.
 */
2141vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2142 unsigned long addr, pfn_t pfn)
2143{
2144 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2145}
2146EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2147
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
2153static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2154 unsigned long addr, unsigned long end,
2155 unsigned long pfn, pgprot_t prot)
2156{
2157 pte_t *pte;
2158 spinlock_t *ptl;
2159 int err = 0;
2160
2161 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2162 if (!pte)
2163 return -ENOMEM;
2164 arch_enter_lazy_mmu_mode();
2165 do {
2166 BUG_ON(!pte_none(*pte));
2167 if (!pfn_modify_allowed(pfn, prot)) {
2168 err = -EACCES;
2169 break;
2170 }
2171 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2172 pfn++;
2173 } while (pte++, addr += PAGE_SIZE, addr != end);
2174 arch_leave_lazy_mmu_mode();
2175 pte_unmap_unlock(pte - 1, ptl);
2176 return err;
2177}
2178
2179static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2180 unsigned long addr, unsigned long end,
2181 unsigned long pfn, pgprot_t prot)
2182{
2183 pmd_t *pmd;
2184 unsigned long next;
2185 int err;
2186
2187 pfn -= addr >> PAGE_SHIFT;
2188 pmd = pmd_alloc(mm, pud, addr);
2189 if (!pmd)
2190 return -ENOMEM;
2191 VM_BUG_ON(pmd_trans_huge(*pmd));
2192 do {
2193 next = pmd_addr_end(addr, end);
2194 err = remap_pte_range(mm, pmd, addr, next,
2195 pfn + (addr >> PAGE_SHIFT), prot);
2196 if (err)
2197 return err;
2198 } while (pmd++, addr = next, addr != end);
2199 return 0;
2200}
2201
2202static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2203 unsigned long addr, unsigned long end,
2204 unsigned long pfn, pgprot_t prot)
2205{
2206 pud_t *pud;
2207 unsigned long next;
2208 int err;
2209
2210 pfn -= addr >> PAGE_SHIFT;
2211 pud = pud_alloc(mm, p4d, addr);
2212 if (!pud)
2213 return -ENOMEM;
2214 do {
2215 next = pud_addr_end(addr, end);
2216 err = remap_pmd_range(mm, pud, addr, next,
2217 pfn + (addr >> PAGE_SHIFT), prot);
2218 if (err)
2219 return err;
2220 } while (pud++, addr = next, addr != end);
2221 return 0;
2222}
2223
2224static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2225 unsigned long addr, unsigned long end,
2226 unsigned long pfn, pgprot_t prot)
2227{
2228 p4d_t *p4d;
2229 unsigned long next;
2230 int err;
2231
2232 pfn -= addr >> PAGE_SHIFT;
2233 p4d = p4d_alloc(mm, pgd, addr);
2234 if (!p4d)
2235 return -ENOMEM;
2236 do {
2237 next = p4d_addr_end(addr, end);
2238 err = remap_pud_range(mm, p4d, addr, next,
2239 pfn + (addr >> PAGE_SHIFT), prot);
2240 if (err)
2241 return err;
2242 } while (p4d++, addr = next, addr != end);
2243 return 0;
2244}
2245
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
2258int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2259 unsigned long pfn, unsigned long size, pgprot_t prot)
2260{
2261 pgd_t *pgd;
2262 unsigned long next;
2263 unsigned long end = addr + PAGE_ALIGN(size);
2264 struct mm_struct *mm = vma->vm_mm;
2265 unsigned long remap_pfn = pfn;
2266 int err;
2267
2268 if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2269 return -EINVAL;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
2289 if (is_cow_mapping(vma->vm_flags)) {
2290 if (addr != vma->vm_start || end != vma->vm_end)
2291 return -EINVAL;
2292 vma->vm_pgoff = pfn;
2293 }
2294
2295 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
2296 if (err)
2297 return -EINVAL;
2298
2299 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2300
2301 BUG_ON(addr >= end);
2302 pfn -= addr >> PAGE_SHIFT;
2303 pgd = pgd_offset(mm, addr);
2304 flush_cache_range(vma, addr, end);
2305 do {
2306 next = pgd_addr_end(addr, end);
2307 err = remap_p4d_range(mm, pgd, addr, next,
2308 pfn + (addr >> PAGE_SHIFT), prot);
2309 if (err)
2310 break;
2311 } while (pgd++, addr = next, addr != end);
2312
2313 if (err)
2314 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
2315
2316 return err;
2317}
2318EXPORT_SYMBOL(remap_pfn_range);
2319
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
2335int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2336{
2337 unsigned long vm_len, pfn, pages;
2338
2339
2340 if (start + len < start)
2341 return -EINVAL;
2342
2343
2344
2345
2346
2347 len += start & ~PAGE_MASK;
2348 pfn = start >> PAGE_SHIFT;
2349 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2350 if (pfn + pages < pfn)
2351 return -EINVAL;
2352
2353
2354 if (vma->vm_pgoff > pages)
2355 return -EINVAL;
2356 pfn += vma->vm_pgoff;
2357 pages -= vma->vm_pgoff;
2358
2359
2360 vm_len = vma->vm_end - vma->vm_start;
2361 if (vm_len >> PAGE_SHIFT > pages)
2362 return -EINVAL;
2363
2364
2365 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2366}
2367EXPORT_SYMBOL(vm_iomap_memory);
2368
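/*
 * The apply_to_*_range() helpers below walk (and, when @create is true,
 * allocate) the page-table levels for a range and invoke @fn on each pte,
 * recording which levels were touched in @mask so callers can sync kernel
 * mappings afterwards.
 */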
2369static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2370 unsigned long addr, unsigned long end,
2371 pte_fn_t fn, void *data, bool create,
2372 pgtbl_mod_mask *mask)
2373{
2374 pte_t *pte;
2375 int err = 0;
2376 spinlock_t *ptl;
2377
2378 if (create) {
2379 pte = (mm == &init_mm) ?
2380 pte_alloc_kernel_track(pmd, addr, mask) :
2381 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2382 if (!pte)
2383 return -ENOMEM;
2384 } else {
2385 pte = (mm == &init_mm) ?
2386 pte_offset_kernel(pmd, addr) :
2387 pte_offset_map_lock(mm, pmd, addr, &ptl);
2388 }
2389
2390 BUG_ON(pmd_huge(*pmd));
2391
2392 arch_enter_lazy_mmu_mode();
2393
2394 if (fn) {
2395 do {
2396 if (create || !pte_none(*pte)) {
2397 err = fn(pte++, addr, data);
2398 if (err)
2399 break;
2400 }
2401 } while (addr += PAGE_SIZE, addr != end);
2402 }
2403 *mask |= PGTBL_PTE_MODIFIED;
2404
2405 arch_leave_lazy_mmu_mode();
2406
2407 if (mm != &init_mm)
2408 pte_unmap_unlock(pte-1, ptl);
2409 return err;
2410}
2411
2412static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2413 unsigned long addr, unsigned long end,
2414 pte_fn_t fn, void *data, bool create,
2415 pgtbl_mod_mask *mask)
2416{
2417 pmd_t *pmd;
2418 unsigned long next;
2419 int err = 0;
2420
2421 BUG_ON(pud_huge(*pud));
2422
2423 if (create) {
2424 pmd = pmd_alloc_track(mm, pud, addr, mask);
2425 if (!pmd)
2426 return -ENOMEM;
2427 } else {
2428 pmd = pmd_offset(pud, addr);
2429 }
2430 do {
2431 next = pmd_addr_end(addr, end);
2432 if (create || !pmd_none_or_clear_bad(pmd)) {
2433 err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
2434 create, mask);
2435 if (err)
2436 break;
2437 }
2438 } while (pmd++, addr = next, addr != end);
2439 return err;
2440}
2441
2442static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2443 unsigned long addr, unsigned long end,
2444 pte_fn_t fn, void *data, bool create,
2445 pgtbl_mod_mask *mask)
2446{
2447 pud_t *pud;
2448 unsigned long next;
2449 int err = 0;
2450
2451 if (create) {
2452 pud = pud_alloc_track(mm, p4d, addr, mask);
2453 if (!pud)
2454 return -ENOMEM;
2455 } else {
2456 pud = pud_offset(p4d, addr);
2457 }
2458 do {
2459 next = pud_addr_end(addr, end);
2460 if (create || !pud_none_or_clear_bad(pud)) {
2461 err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
2462 create, mask);
2463 if (err)
2464 break;
2465 }
2466 } while (pud++, addr = next, addr != end);
2467 return err;
2468}
2469
2470static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2471 unsigned long addr, unsigned long end,
2472 pte_fn_t fn, void *data, bool create,
2473 pgtbl_mod_mask *mask)
2474{
2475 p4d_t *p4d;
2476 unsigned long next;
2477 int err = 0;
2478
2479 if (create) {
2480 p4d = p4d_alloc_track(mm, pgd, addr, mask);
2481 if (!p4d)
2482 return -ENOMEM;
2483 } else {
2484 p4d = p4d_offset(pgd, addr);
2485 }
2486 do {
2487 next = p4d_addr_end(addr, end);
2488 if (create || !p4d_none_or_clear_bad(p4d)) {
2489 err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
2490 create, mask);
2491 if (err)
2492 break;
2493 }
2494 } while (p4d++, addr = next, addr != end);
2495 return err;
2496}
2497
2498static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2499 unsigned long size, pte_fn_t fn,
2500 void *data, bool create)
2501{
2502 pgd_t *pgd;
2503 unsigned long start = addr, next;
2504 unsigned long end = addr + size;
2505 pgtbl_mod_mask mask = 0;
2506 int err = 0;
2507
2508 if (WARN_ON(addr >= end))
2509 return -EINVAL;
2510
2511 pgd = pgd_offset(mm, addr);
2512 do {
2513 next = pgd_addr_end(addr, end);
2514 if (!create && pgd_none_or_clear_bad(pgd))
2515 continue;
2516 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
2517 if (err)
2518 break;
2519 } while (pgd++, addr = next, addr != end);
2520
2521 if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2522 arch_sync_kernel_mappings(start, start + size);
2523
2524 return err;
2525}
2526
/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
2531int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2532 unsigned long size, pte_fn_t fn, void *data)
2533{
2534 return __apply_to_page_range(mm, addr, size, fn, data, true);
2535}
2536EXPORT_SYMBOL_GPL(apply_to_page_range);
2537
/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
2545int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2546 unsigned long size, pte_fn_t fn, void *data)
2547{
2548 return __apply_to_page_range(mm, addr, size, fn, data, false);
2549}
2550EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
2551
/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
2560static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2561 pte_t *page_table, pte_t orig_pte)
2562{
2563 int same = 1;
2564#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2565 if (sizeof(pte_t) > sizeof(unsigned long)) {
2566 spinlock_t *ptl = pte_lockptr(mm, pmd);
2567 spin_lock(ptl);
2568 same = pte_same(*page_table, orig_pte);
2569 spin_unlock(ptl);
2570 }
2571#endif
2572 pte_unmap(page_table);
2573 return same;
2574}
2575
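/*
 * Copy the contents of the faulting page into @dst for a COW break.  When
 * @src is NULL (a special/PFN mapping with no struct page), the data is
 * copied from the user address instead, retrying under the pte lock and
 * falling back to zero-filling if the source is truly unreadable.
 */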
2576static inline bool cow_user_page(struct page *dst, struct page *src,
2577 struct vm_fault *vmf)
2578{
2579 bool ret;
2580 void *kaddr;
2581 void __user *uaddr;
2582 bool locked = false;
2583 struct vm_area_struct *vma = vmf->vma;
2584 struct mm_struct *mm = vma->vm_mm;
2585 unsigned long addr = vmf->address;
2586
2587 if (likely(src)) {
2588 copy_user_highpage(dst, src, addr, vma);
2589 return true;
2590 }
2591
2592
2593
2594
2595
2596
2597
2598 kaddr = kmap_atomic(dst);
2599 uaddr = (void __user *)(addr & PAGE_MASK);
2600
2601
2602
2603
2604
2605 if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
2606 pte_t entry;
2607
2608 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2609 locked = true;
2610 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2611
2612
2613
2614
2615 update_mmu_tlb(vma, addr, vmf->pte);
2616 ret = false;
2617 goto pte_unlock;
2618 }
2619
2620 entry = pte_mkyoung(vmf->orig_pte);
2621 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
2622 update_mmu_cache(vma, addr, vmf->pte);
2623 }
2624
2625
2626
2627
2628
2629
2630
2631 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2632 if (locked)
2633 goto warn;
2634
2635
2636 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2637 locked = true;
2638 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2639
2640 update_mmu_tlb(vma, addr, vmf->pte);
2641 ret = false;
2642 goto pte_unlock;
2643 }
2644
2645
2646
2647
2648
2649 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2650
2651
2652
2653
2654warn:
2655 WARN_ON_ONCE(1);
2656 clear_page(kaddr);
2657 }
2658 }
2659
2660 ret = true;
2661
2662pte_unlock:
2663 if (locked)
2664 pte_unmap_unlock(vmf->pte, vmf->ptl);
2665 kunmap_atomic(kaddr);
2666 flush_dcache_page(dst);
2667
2668 return ret;
2669}
2670
2671static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2672{
2673 struct file *vm_file = vma->vm_file;
2674
2675 if (vm_file)
2676 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
2682 return GFP_KERNEL;
2683}
2684
/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
2691static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2692{
2693 vm_fault_t ret;
2694 struct page *page = vmf->page;
2695 unsigned int old_flags = vmf->flags;
2696
2697 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2698
2699 if (vmf->vma->vm_file &&
2700 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2701 return VM_FAULT_SIGBUS;
2702
2703 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2704
2705 vmf->flags = old_flags;
2706 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2707 return ret;
2708 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2709 lock_page(page);
2710 if (!page->mapping) {
2711 unlock_page(page);
2712 return 0;
2713 }
2714 ret |= VM_FAULT_LOCKED;
2715 } else
2716 VM_BUG_ON_PAGE(!PageLocked(page), page);
2717 return ret;
2718}
2719
2720
2721
2722
2723
2724
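/*
 * Handle dirtying of a page after a write fault on a shared mapping:
 * mark the page dirty, drop the page lock, update the file time when
 * there is no ->page_mkwrite handler, and throttle the task via
 * balance_dirty_pages_ratelimited().  The mmap_lock may be dropped for
 * I/O, in which case VM_FAULT_RETRY is returned.
 */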
2725static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2726{
2727 struct vm_area_struct *vma = vmf->vma;
2728 struct address_space *mapping;
2729 struct page *page = vmf->page;
2730 bool dirtied;
2731 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2732
2733 dirtied = set_page_dirty(page);
2734 VM_BUG_ON_PAGE(PageAnon(page), page);
2735
2736
2737
2738
2739
2740
2741 mapping = page_rmapping(page);
2742 unlock_page(page);
2743
2744 if (!page_mkwrite)
2745 file_update_time(vma->vm_file);
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756 if ((dirtied || page_mkwrite) && mapping) {
2757 struct file *fpin;
2758
2759 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2760 balance_dirty_pages_ratelimited(mapping);
2761 if (fpin) {
2762 fput(fpin);
2763 return VM_FAULT_RETRY;
2764 }
2765 }
2766
2767 return 0;
2768}
2769
2770
2771
2772
2773
2774
2775
2776
2777
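/*
 * Handle a write fault by reusing the page already mapped at the
 * faulting address: the pte is made young and dirty (and writable for
 * a writable VMA) without copying anything.  Called with, and
 * releases, the pte lock.
 */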
2778static inline void wp_page_reuse(struct vm_fault *vmf)
2779 __releases(vmf->ptl)
2780{
2781 struct vm_area_struct *vma = vmf->vma;
2782 struct page *page = vmf->page;
2783 pte_t entry;
2784
2785
2786
2787
2788
2789 if (page)
2790 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2791
2792 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2793 entry = pte_mkyoung(vmf->orig_pte);
2794 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2795 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2796 update_mmu_cache(vma, vmf->address, vmf->pte);
2797 pte_unmap_unlock(vmf->pte, vmf->ptl);
2798 count_vm_event(PGREUSE);
2799}
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
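/*
 * Break COW by copying into a new page.  A new anonymous page is
 * allocated (zero-filled if the old pte mapped the zero page), the old
 * contents are copied, the page is charged to the memcg, and the pte
 * is replaced under its lock with mmu-notifier coverage.  Returns
 * VM_FAULT_WRITE on success, 0 if the pte changed or the copy raced
 * with an unmap, or VM_FAULT_OOM on allocation failure.
 */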
2817static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2818{
2819 struct vm_area_struct *vma = vmf->vma;
2820 struct mm_struct *mm = vma->vm_mm;
2821 struct page *old_page = vmf->page;
2822 struct page *new_page = NULL;
2823 pte_t entry;
2824 int page_copied = 0;
2825 struct mmu_notifier_range range;
2826
2827 if (unlikely(anon_vma_prepare(vma)))
2828 goto oom;
2829
2830 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2831 new_page = alloc_zeroed_user_highpage_movable(vma,
2832 vmf->address);
2833 if (!new_page)
2834 goto oom;
2835 } else {
2836 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2837 vmf->address);
2838 if (!new_page)
2839 goto oom;
2840
2841 if (!cow_user_page(new_page, old_page, vmf)) {
2842
2843
2844
2845
2846
2847
2848 put_page(new_page);
2849 if (old_page)
2850 put_page(old_page);
2851 return 0;
2852 }
2853 }
2854
2855 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
2856 goto oom_free_new;
2857 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
2858
2859 __SetPageUptodate(new_page);
2860
2861 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
2862 vmf->address & PAGE_MASK,
2863 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2864 mmu_notifier_invalidate_range_start(&range);
2865
2866
2867
2868
2869 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2870 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2871 if (old_page) {
2872 if (!PageAnon(old_page)) {
2873 dec_mm_counter_fast(mm,
2874 mm_counter_file(old_page));
2875 inc_mm_counter_fast(mm, MM_ANONPAGES);
2876 }
2877 } else {
2878 inc_mm_counter_fast(mm, MM_ANONPAGES);
2879 }
2880 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2881 entry = mk_pte(new_page, vma->vm_page_prot);
2882 entry = pte_sw_mkyoung(entry);
2883 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2884
2885
2886
2887
2888
2889
2890 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2891 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2892 lru_cache_add_inactive_or_unevictable(new_page, vma);
2893
2894
2895
2896
2897
2898 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2899 update_mmu_cache(vma, vmf->address, vmf->pte);
2900 if (old_page) {
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923 page_remove_rmap(old_page, false);
2924 }
2925
2926
2927 new_page = old_page;
2928 page_copied = 1;
2929 } else {
2930 update_mmu_tlb(vma, vmf->address, vmf->pte);
2931 }
2932
2933 if (new_page)
2934 put_page(new_page);
2935
2936 pte_unmap_unlock(vmf->pte, vmf->ptl);
2937
2938
2939
2940
2941 mmu_notifier_invalidate_range_only_end(&range);
2942 if (old_page) {
2943
2944
2945
2946
2947 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2948 lock_page(old_page);
2949 if (PageMlocked(old_page))
2950 munlock_vma_page(old_page);
2951 unlock_page(old_page);
2952 }
2953 put_page(old_page);
2954 }
2955 return page_copied ? VM_FAULT_WRITE : 0;
2956oom_free_new:
2957 put_page(new_page);
2958oom:
2959 if (old_page)
2960 put_page(old_page);
2961 return VM_FAULT_OOM;
2962}
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
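/*
 * Finish a write fault on a shared mapping after ->pfn_mkwrite() or
 * ->page_mkwrite() has been called with the pte lock dropped: retake
 * the lock, make sure the pte did not change in the meantime (return
 * VM_FAULT_NOPAGE if it did), then mark the pte writable via
 * wp_page_reuse().
 */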
2980vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2981{
2982 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2983 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2984 &vmf->ptl);
2985
2986
2987
2988
2989 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2990 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
2991 pte_unmap_unlock(vmf->pte, vmf->ptl);
2992 return VM_FAULT_NOPAGE;
2993 }
2994 wp_page_reuse(vmf);
2995 return 0;
2996}
2997
2998
2999
3000
3001
3002static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3003{
3004 struct vm_area_struct *vma = vmf->vma;
3005
3006 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3007 vm_fault_t ret;
3008
3009 pte_unmap_unlock(vmf->pte, vmf->ptl);
3010 vmf->flags |= FAULT_FLAG_MKWRITE;
3011 ret = vma->vm_ops->pfn_mkwrite(vmf);
3012 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3013 return ret;
3014 return finish_mkwrite_fault(vmf);
3015 }
3016 wp_page_reuse(vmf);
3017 return VM_FAULT_WRITE;
3018}
3019
3020static vm_fault_t wp_page_shared(struct vm_fault *vmf)
3021 __releases(vmf->ptl)
3022{
3023 struct vm_area_struct *vma = vmf->vma;
3024 vm_fault_t ret = VM_FAULT_WRITE;
3025
3026 get_page(vmf->page);
3027
3028 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3029 vm_fault_t tmp;
3030
3031 pte_unmap_unlock(vmf->pte, vmf->ptl);
3032 tmp = do_page_mkwrite(vmf);
3033 if (unlikely(!tmp || (tmp &
3034 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3035 put_page(vmf->page);
3036 return tmp;
3037 }
3038 tmp = finish_mkwrite_fault(vmf);
3039 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3040 unlock_page(vmf->page);
3041 put_page(vmf->page);
3042 return tmp;
3043 }
3044 } else {
3045 wp_page_reuse(vmf);
3046 lock_page(vmf->page);
3047 }
3048 ret |= fault_dirty_shared_page(vmf);
3049 put_page(vmf->page);
3050
3051 return ret;
3052}
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
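/*
 * Handle a write fault on a pte that is present but not writable.
 * Userfaultfd write-protect faults are handed to userspace.  A shared
 * writable mapping reuses the page (notifying the filesystem or pfn
 * provider as needed); an anonymous page with a single reference is
 * reused in place; everything else is copied via wp_page_copy()
 * (copy-on-write).  Called with the pte lock held, which is released
 * before returning.
 */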
3072static vm_fault_t do_wp_page(struct vm_fault *vmf)
3073 __releases(vmf->ptl)
3074{
3075 struct vm_area_struct *vma = vmf->vma;
3076
3077 if (userfaultfd_pte_wp(vma, *vmf->pte)) {
3078 pte_unmap_unlock(vmf->pte, vmf->ptl);
3079 return handle_userfault(vmf, VM_UFFD_WP);
3080 }
3081
3082 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3083 if (!vmf->page) {
3084
3085
3086
3087
3088
3089
3090
3091 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3092 (VM_WRITE|VM_SHARED))
3093 return wp_pfn_shared(vmf);
3094
3095 pte_unmap_unlock(vmf->pte, vmf->ptl);
3096 return wp_page_copy(vmf);
3097 }
3098
3099
3100
3101
3102
3103 if (PageAnon(vmf->page)) {
3104 struct page *page = vmf->page;
3105
3106
3107 if (PageKsm(page) || page_count(page) != 1)
3108 goto copy;
3109 if (!trylock_page(page))
3110 goto copy;
3111 if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
3112 unlock_page(page);
3113 goto copy;
3114 }
3115
3116
3117
3118
3119
3120 unlock_page(page);
3121 wp_page_reuse(vmf);
3122 return VM_FAULT_WRITE;
3123 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3124 (VM_WRITE|VM_SHARED))) {
3125 return wp_page_shared(vmf);
3126 }
3127copy:
3128
3129
3130
3131 get_page(vmf->page);
3132
3133 pte_unmap_unlock(vmf->pte, vmf->ptl);
3134 return wp_page_copy(vmf);
3135}
3136
3137static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3138 unsigned long start_addr, unsigned long end_addr,
3139 struct zap_details *details)
3140{
3141 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3142}
3143
3144static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3145 struct zap_details *details)
3146{
3147 struct vm_area_struct *vma;
3148 pgoff_t vba, vea, zba, zea;
3149
3150 vma_interval_tree_foreach(vma, root,
3151 details->first_index, details->last_index) {
3152
3153 vba = vma->vm_pgoff;
3154 vea = vba + vma_pages(vma) - 1;
3155 zba = details->first_index;
3156 if (zba < vba)
3157 zba = vba;
3158 zea = details->last_index;
3159 if (zea > vea)
3160 zea = vea;
3161
3162 unmap_mapping_range_vma(vma,
3163 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3164 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3165 details);
3166 }
3167}
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
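/*
 * Unmap a range of pages of an address_space from every user address
 * space that maps it.  @start and @nr give the range in units of
 * pages; when @even_cows is false, private COW copies of the file
 * pages are left in place.
 */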
3181void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3182 pgoff_t nr, bool even_cows)
3183{
3184 struct zap_details details = { };
3185
3186 details.check_mapping = even_cows ? NULL : mapping;
3187 details.first_index = start;
3188 details.last_index = start + nr - 1;
3189 if (details.last_index < details.first_index)
3190 details.last_index = ULONG_MAX;
3191
3192 i_mmap_lock_write(mapping);
3193 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3194 unmap_mapping_range_tree(&mapping->i_mmap, &details);
3195 i_mmap_unlock_write(mapping);
3196}
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
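/*
 * Byte-granular wrapper around unmap_mapping_pages(): @holebegin and
 * @holelen describe the hole being punched in the file (a @holelen of
 * zero extends the hole to the end of the file), and @even_cows
 * selects whether private COW copies are zapped as well.
 */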
3215void unmap_mapping_range(struct address_space *mapping,
3216 loff_t const holebegin, loff_t const holelen, int even_cows)
3217{
3218 pgoff_t hba = holebegin >> PAGE_SHIFT;
3219 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3220
3221
3222 if (sizeof(holelen) > sizeof(hlen)) {
3223 long long holeend =
3224 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3225 if (holeend & ~(long long)ULONG_MAX)
3226 hlen = ULONG_MAX - hba + 1;
3227 }
3228
3229 unmap_mapping_pages(mapping, hba, hlen, even_cows);
3230}
3231EXPORT_SYMBOL(unmap_mapping_range);
3232
3233
3234
3235
3236
3237
3238
3239
3240
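/*
 * Handle a fault on a pte that holds a swap entry.  Non-swap special
 * entries (migration, device-private, hwpoison) are dispatched first;
 * otherwise the page is looked up in, or read into, the swap cache
 * (with a synchronous read that bypasses the swap cache for
 * SWP_SYNCHRONOUS_IO devices), and then mapped back in under the pte
 * lock.  Returns with the pte unmapped and unlocked.
 */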
3241vm_fault_t do_swap_page(struct vm_fault *vmf)
3242{
3243 struct vm_area_struct *vma = vmf->vma;
3244 struct page *page = NULL, *swapcache;
3245 swp_entry_t entry;
3246 pte_t pte;
3247 int locked;
3248 int exclusive = 0;
3249 vm_fault_t ret = 0;
3250 void *shadow = NULL;
3251
3252 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
3253 goto out;
3254
3255 entry = pte_to_swp_entry(vmf->orig_pte);
3256 if (unlikely(non_swap_entry(entry))) {
3257 if (is_migration_entry(entry)) {
3258 migration_entry_wait(vma->vm_mm, vmf->pmd,
3259 vmf->address);
3260 } else if (is_device_private_entry(entry)) {
3261 vmf->page = device_private_entry_to_page(entry);
3262 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3263 } else if (is_hwpoison_entry(entry)) {
3264 ret = VM_FAULT_HWPOISON;
3265 } else {
3266 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
3267 ret = VM_FAULT_SIGBUS;
3268 }
3269 goto out;
3270 }
3271
3272
3273 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
3274 page = lookup_swap_cache(entry, vma, vmf->address);
3275 swapcache = page;
3276
3277 if (!page) {
3278 struct swap_info_struct *si = swp_swap_info(entry);
3279
3280 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
3281 __swap_count(entry) == 1) {
3282
3283 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3284 vmf->address);
3285 if (page) {
3286 int err;
3287
3288 __SetPageLocked(page);
3289 __SetPageSwapBacked(page);
3290 set_page_private(page, entry.val);
3291
3292
3293 SetPageSwapCache(page);
3294 err = mem_cgroup_charge(page, vma->vm_mm,
3295 GFP_KERNEL);
3296 ClearPageSwapCache(page);
3297 if (err) {
3298 ret = VM_FAULT_OOM;
3299 goto out_page;
3300 }
3301
3302 shadow = get_shadow_from_swap_cache(entry);
3303 if (shadow)
3304 workingset_refault(page, shadow);
3305
3306 lru_cache_add(page);
3307 swap_readpage(page, true);
3308 }
3309 } else {
3310 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3311 vmf);
3312 swapcache = page;
3313 }
3314
3315 if (!page) {
3316
3317
3318
3319
3320 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3321 vmf->address, &vmf->ptl);
3322 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3323 ret = VM_FAULT_OOM;
3324 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3325 goto unlock;
3326 }
3327
3328
3329 ret = VM_FAULT_MAJOR;
3330 count_vm_event(PGMAJFAULT);
3331 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3332 } else if (PageHWPoison(page)) {
3333
3334
3335
3336
3337 ret = VM_FAULT_HWPOISON;
3338 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3339 goto out_release;
3340 }
3341
3342 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
3343
3344 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3345 if (!locked) {
3346 ret |= VM_FAULT_RETRY;
3347 goto out_release;
3348 }
3349
3350
3351
3352
3353
3354
3355
3356 if (unlikely((!PageSwapCache(page) ||
3357 page_private(page) != entry.val)) && swapcache)
3358 goto out_page;
3359
3360 page = ksm_might_need_to_copy(page, vma, vmf->address);
3361 if (unlikely(!page)) {
3362 ret = VM_FAULT_OOM;
3363 page = swapcache;
3364 goto out_page;
3365 }
3366
3367 cgroup_throttle_swaprate(page, GFP_KERNEL);
3368
3369
3370
3371
3372 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3373 &vmf->ptl);
3374 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3375 goto out_nomap;
3376
3377 if (unlikely(!PageUptodate(page))) {
3378 ret = VM_FAULT_SIGBUS;
3379 goto out_nomap;
3380 }
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3393 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3394 pte = mk_pte(page, vma->vm_page_prot);
3395 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3396 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3397 vmf->flags &= ~FAULT_FLAG_WRITE;
3398 ret |= VM_FAULT_WRITE;
3399 exclusive = RMAP_EXCLUSIVE;
3400 }
3401 flush_icache_page(vma, page);
3402 if (pte_swp_soft_dirty(vmf->orig_pte))
3403 pte = pte_mksoft_dirty(pte);
3404 if (pte_swp_uffd_wp(vmf->orig_pte)) {
3405 pte = pte_mkuffd_wp(pte);
3406 pte = pte_wrprotect(pte);
3407 }
3408 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3409 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3410 vmf->orig_pte = pte;
3411
3412
3413 if (unlikely(page != swapcache && swapcache)) {
3414 page_add_new_anon_rmap(page, vma, vmf->address, false);
3415 lru_cache_add_inactive_or_unevictable(page, vma);
3416 } else {
3417 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3418 }
3419
3420 swap_free(entry);
3421 if (mem_cgroup_swap_full(page) ||
3422 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3423 try_to_free_swap(page);
3424 unlock_page(page);
3425 if (page != swapcache && swapcache) {
3426
3427
3428
3429
3430
3431
3432
3433
3434 unlock_page(swapcache);
3435 put_page(swapcache);
3436 }
3437
3438 if (vmf->flags & FAULT_FLAG_WRITE) {
3439 ret |= do_wp_page(vmf);
3440 if (ret & VM_FAULT_ERROR)
3441 ret &= VM_FAULT_ERROR;
3442 goto out;
3443 }
3444
3445
3446 update_mmu_cache(vma, vmf->address, vmf->pte);
3447unlock:
3448 pte_unmap_unlock(vmf->pte, vmf->ptl);
3449out:
3450 return ret;
3451out_nomap:
3452 pte_unmap_unlock(vmf->pte, vmf->ptl);
3453out_page:
3454 unlock_page(page);
3455out_release:
3456 put_page(page);
3457 if (page != swapcache && swapcache) {
3458 unlock_page(swapcache);
3459 put_page(swapcache);
3460 }
3461 return ret;
3462}
3463
3464
3465
3466
3467
3468
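/*
 * Handle a fault in an anonymous mapping with no pte present.  Shared
 * mappings are rejected with SIGBUS.  Read faults map the shared zero
 * page read-only (when the zero page is allowed); write faults
 * allocate a zeroed page, charge it to the memcg and install it.
 * Missing-page userfaultfd registrations are forwarded to userspace
 * instead.
 */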
3469static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3470{
3471 struct vm_area_struct *vma = vmf->vma;
3472 struct page *page;
3473 vm_fault_t ret = 0;
3474 pte_t entry;
3475
3476
3477 if (vma->vm_flags & VM_SHARED)
3478 return VM_FAULT_SIGBUS;
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490 if (pte_alloc(vma->vm_mm, vmf->pmd))
3491 return VM_FAULT_OOM;
3492
3493
3494 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3495 return 0;
3496
3497
3498 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3499 !mm_forbids_zeropage(vma->vm_mm)) {
3500 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3501 vma->vm_page_prot));
3502 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3503 vmf->address, &vmf->ptl);
3504 if (!pte_none(*vmf->pte)) {
3505 update_mmu_tlb(vma, vmf->address, vmf->pte);
3506 goto unlock;
3507 }
3508 ret = check_stable_address_space(vma->vm_mm);
3509 if (ret)
3510 goto unlock;
3511
3512 if (userfaultfd_missing(vma)) {
3513 pte_unmap_unlock(vmf->pte, vmf->ptl);
3514 return handle_userfault(vmf, VM_UFFD_MISSING);
3515 }
3516 goto setpte;
3517 }
3518
3519
3520 if (unlikely(anon_vma_prepare(vma)))
3521 goto oom;
3522 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3523 if (!page)
3524 goto oom;
3525
3526 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
3527 goto oom_free_page;
3528 cgroup_throttle_swaprate(page, GFP_KERNEL);
3529
3530
3531
3532
3533
3534
3535 __SetPageUptodate(page);
3536
3537 entry = mk_pte(page, vma->vm_page_prot);
3538 entry = pte_sw_mkyoung(entry);
3539 if (vma->vm_flags & VM_WRITE)
3540 entry = pte_mkwrite(pte_mkdirty(entry));
3541
3542 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3543 &vmf->ptl);
3544 if (!pte_none(*vmf->pte)) {
3545 update_mmu_cache(vma, vmf->address, vmf->pte);
3546 goto release;
3547 }
3548
3549 ret = check_stable_address_space(vma->vm_mm);
3550 if (ret)
3551 goto release;
3552
3553
3554 if (userfaultfd_missing(vma)) {
3555 pte_unmap_unlock(vmf->pte, vmf->ptl);
3556 put_page(page);
3557 return handle_userfault(vmf, VM_UFFD_MISSING);
3558 }
3559
3560 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3561 page_add_new_anon_rmap(page, vma, vmf->address, false);
3562 lru_cache_add_inactive_or_unevictable(page, vma);
3563setpte:
3564 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3565
3566
3567 update_mmu_cache(vma, vmf->address, vmf->pte);
3568unlock:
3569 pte_unmap_unlock(vmf->pte, vmf->ptl);
3570 return ret;
3571release:
3572 put_page(page);
3573 goto unlock;
3574oom_free_page:
3575 put_page(page);
3576oom:
3577 return VM_FAULT_OOM;
3578}
3579
3580
3581
3582
3583
3584
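/*
 * Invoke the vma's ->fault() handler to get the page for a file fault.
 * If the pmd is still empty, a pte page is preallocated up front so it
 * does not have to be allocated later while the faulted page is
 * locked.  Hwpoisoned pages are rejected, and the page is returned
 * locked.
 */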
3585static vm_fault_t __do_fault(struct vm_fault *vmf)
3586{
3587 struct vm_area_struct *vma = vmf->vma;
3588 vm_fault_t ret;
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3606 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3607 if (!vmf->prealloc_pte)
3608 return VM_FAULT_OOM;
3609 smp_wmb();
3610 }
3611
3612 ret = vma->vm_ops->fault(vmf);
3613 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3614 VM_FAULT_DONE_COW)))
3615 return ret;
3616
3617 if (unlikely(PageHWPoison(vmf->page))) {
3618 if (ret & VM_FAULT_LOCKED)
3619 unlock_page(vmf->page);
3620 put_page(vmf->page);
3621 vmf->page = NULL;
3622 return VM_FAULT_HWPOISON;
3623 }
3624
3625 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3626 lock_page(vmf->page);
3627 else
3628 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3629
3630 return ret;
3631}
3632
3633
3634
3635
3636
3637
3638
3639static int pmd_devmap_trans_unstable(pmd_t *pmd)
3640{
3641 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3642}
3643
3644static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3645{
3646 struct vm_area_struct *vma = vmf->vma;
3647
3648 if (!pmd_none(*vmf->pmd))
3649 goto map_pte;
3650 if (vmf->prealloc_pte) {
3651 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3652 if (unlikely(!pmd_none(*vmf->pmd))) {
3653 spin_unlock(vmf->ptl);
3654 goto map_pte;
3655 }
3656
3657 mm_inc_nr_ptes(vma->vm_mm);
3658 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3659 spin_unlock(vmf->ptl);
3660 vmf->prealloc_pte = NULL;
3661 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3662 return VM_FAULT_OOM;
3663 }
3664map_pte:
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676 if (pmd_devmap_trans_unstable(vmf->pmd))
3677 return VM_FAULT_NOPAGE;
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3689 &vmf->ptl);
3690 return 0;
3691}
3692
3693#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3694static void deposit_prealloc_pte(struct vm_fault *vmf)
3695{
3696 struct vm_area_struct *vma = vmf->vma;
3697
3698 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3699
3700
3701
3702
3703 mm_inc_nr_ptes(vma->vm_mm);
3704 vmf->prealloc_pte = NULL;
3705}
3706
3707static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3708{
3709 struct vm_area_struct *vma = vmf->vma;
3710 bool write = vmf->flags & FAULT_FLAG_WRITE;
3711 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3712 pmd_t entry;
3713 int i;
3714 vm_fault_t ret = VM_FAULT_FALLBACK;
3715
3716 if (!transhuge_vma_suitable(vma, haddr))
3717 return ret;
3718
3719 page = compound_head(page);
3720 if (compound_order(page) != HPAGE_PMD_ORDER)
3721 return ret;
3722
3723
3724
3725
3726
3727 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3728 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3729 if (!vmf->prealloc_pte)
3730 return VM_FAULT_OOM;
3731 smp_wmb();
3732 }
3733
3734 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3735 if (unlikely(!pmd_none(*vmf->pmd)))
3736 goto out;
3737
3738 for (i = 0; i < HPAGE_PMD_NR; i++)
3739 flush_icache_page(vma, page + i);
3740
3741 entry = mk_huge_pmd(page, vma->vm_page_prot);
3742 if (write)
3743 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3744
3745 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3746 page_add_file_rmap(page, true);
3747
3748
3749
3750 if (arch_needs_pgtable_deposit())
3751 deposit_prealloc_pte(vmf);
3752
3753 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3754
3755 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3756
3757
3758 ret = 0;
3759 count_vm_event(THP_FILE_MAPPED);
3760out:
3761 spin_unlock(vmf->ptl);
3762 return ret;
3763}
3764#else
3765static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3766{
3767 BUILD_BUG();
3768 return 0;
3769}
3770#endif
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
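/*
 * Install a pte for @page at the faulting address, mapping it as a
 * huge pmd via do_set_pmd() when possible.  Allocates (or consumes a
 * preallocated) page table if needed, adds the page to the anon rmap
 * and LRU for private write faults, or bumps the file rmap otherwise.
 * On a 0 return with vmf->pte set, the caller must drop the pte lock.
 */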
3787vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
3788{
3789 struct vm_area_struct *vma = vmf->vma;
3790 bool write = vmf->flags & FAULT_FLAG_WRITE;
3791 pte_t entry;
3792 vm_fault_t ret;
3793
3794 if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
3795 ret = do_set_pmd(vmf, page);
3796 if (ret != VM_FAULT_FALLBACK)
3797 return ret;
3798 }
3799
3800 if (!vmf->pte) {
3801 ret = pte_alloc_one_map(vmf);
3802 if (ret)
3803 return ret;
3804 }
3805
3806
3807 if (unlikely(!pte_none(*vmf->pte))) {
3808 update_mmu_tlb(vma, vmf->address, vmf->pte);
3809 return VM_FAULT_NOPAGE;
3810 }
3811
3812 flush_icache_page(vma, page);
3813 entry = mk_pte(page, vma->vm_page_prot);
3814 entry = pte_sw_mkyoung(entry);
3815 if (write)
3816 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3817
3818 if (write && !(vma->vm_flags & VM_SHARED)) {
3819 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3820 page_add_new_anon_rmap(page, vma, vmf->address, false);
3821 lru_cache_add_inactive_or_unevictable(page, vma);
3822 } else {
3823 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3824 page_add_file_rmap(page, false);
3825 }
3826 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3827
3828
3829 update_mmu_cache(vma, vmf->address, vmf->pte);
3830
3831 return 0;
3832}
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
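/*
 * Finish a file/COW fault once the page to map has been prepared:
 * pick the COW page for private write faults (the original page
 * otherwise), make sure the address space is not being torn down by
 * the OOM reaper (check_stable_address_space()), and map the page with
 * alloc_set_pte().  The pte lock, if taken, is dropped before
 * returning.
 */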
3850vm_fault_t finish_fault(struct vm_fault *vmf)
3851{
3852 struct page *page;
3853 vm_fault_t ret = 0;
3854
3855
3856 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3857 !(vmf->vma->vm_flags & VM_SHARED))
3858 page = vmf->cow_page;
3859 else
3860 page = vmf->page;
3861
3862
3863
3864
3865
3866 if (!(vmf->vma->vm_flags & VM_SHARED))
3867 ret = check_stable_address_space(vmf->vma->vm_mm);
3868 if (!ret)
3869 ret = alloc_set_pte(vmf, page);
3870 if (vmf->pte)
3871 pte_unmap_unlock(vmf->pte, vmf->ptl);
3872 return ret;
3873}
3874
3875static unsigned long fault_around_bytes __read_mostly =
3876 rounddown_pow_of_two(65536);
3877
3878#ifdef CONFIG_DEBUG_FS
3879static int fault_around_bytes_get(void *data, u64 *val)
3880{
3881 *val = fault_around_bytes;
3882 return 0;
3883}
3884
3885
3886
3887
3888
3889static int fault_around_bytes_set(void *data, u64 val)
3890{
3891 if (val / PAGE_SIZE > PTRS_PER_PTE)
3892 return -EINVAL;
3893 if (val > PAGE_SIZE)
3894 fault_around_bytes = rounddown_pow_of_two(val);
3895 else
3896 fault_around_bytes = PAGE_SIZE;
3897 return 0;
3898}
3899DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3900 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3901
3902static int __init fault_around_debugfs(void)
3903{
3904 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3905 &fault_around_bytes_fops);
3906 return 0;
3907}
3908late_initcall(fault_around_debugfs);
3909#endif
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
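/*
 * Fault around: ask the filesystem's ->map_pages() to map already
 * up-to-date pagecache pages surrounding the faulting address, bounded
 * by fault_around_bytes, the VMA and the pte page the address falls
 * in.  This does not establish the pte for the faulting address itself
 * unless ->map_pages() happened to do so, in which case
 * VM_FAULT_NOPAGE is returned.
 */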
3935static vm_fault_t do_fault_around(struct vm_fault *vmf)
3936{
3937 unsigned long address = vmf->address, nr_pages, mask;
3938 pgoff_t start_pgoff = vmf->pgoff;
3939 pgoff_t end_pgoff;
3940 int off;
3941 vm_fault_t ret = 0;
3942
3943 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3944 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3945
3946 vmf->address = max(address & mask, vmf->vma->vm_start);
3947 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3948 start_pgoff -= off;
3949
3950
3951
3952
3953
3954 end_pgoff = start_pgoff -
3955 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3956 PTRS_PER_PTE - 1;
3957 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3958 start_pgoff + nr_pages - 1);
3959
3960 if (pmd_none(*vmf->pmd)) {
3961 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3962 if (!vmf->prealloc_pte)
3963 goto out;
3964 smp_wmb();
3965 }
3966
3967 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3968
3969
3970 if (pmd_trans_huge(*vmf->pmd)) {
3971 ret = VM_FAULT_NOPAGE;
3972 goto out;
3973 }
3974
3975
3976 if (!vmf->pte)
3977 goto out;
3978
3979
3980 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3981 if (!pte_none(*vmf->pte))
3982 ret = VM_FAULT_NOPAGE;
3983 pte_unmap_unlock(vmf->pte, vmf->ptl);
3984out:
3985 vmf->address = address;
3986 vmf->pte = NULL;
3987 return ret;
3988}
3989
3990static vm_fault_t do_read_fault(struct vm_fault *vmf)
3991{
3992 struct vm_area_struct *vma = vmf->vma;
3993 vm_fault_t ret = 0;
3994
3995
3996
3997
3998
3999
4000 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
4001 ret = do_fault_around(vmf);
4002 if (ret)
4003 return ret;
4004 }
4005
4006 ret = __do_fault(vmf);
4007 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4008 return ret;
4009
4010 ret |= finish_fault(vmf);
4011 unlock_page(vmf->page);
4012 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4013 put_page(vmf->page);
4014 return ret;
4015}
4016
4017static vm_fault_t do_cow_fault(struct vm_fault *vmf)
4018{
4019 struct vm_area_struct *vma = vmf->vma;
4020 vm_fault_t ret;
4021
4022 if (unlikely(anon_vma_prepare(vma)))
4023 return VM_FAULT_OOM;
4024
4025 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
4026 if (!vmf->cow_page)
4027 return VM_FAULT_OOM;
4028
4029 if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
4030 put_page(vmf->cow_page);
4031 return VM_FAULT_OOM;
4032 }
4033 cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
4034
4035 ret = __do_fault(vmf);
4036 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4037 goto uncharge_out;
4038 if (ret & VM_FAULT_DONE_COW)
4039 return ret;
4040
4041 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
4042 __SetPageUptodate(vmf->cow_page);
4043
4044 ret |= finish_fault(vmf);
4045 unlock_page(vmf->page);
4046 put_page(vmf->page);
4047 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4048 goto uncharge_out;
4049 return ret;
4050uncharge_out:
4051 put_page(vmf->cow_page);
4052 return ret;
4053}
4054
4055static vm_fault_t do_shared_fault(struct vm_fault *vmf)
4056{
4057 struct vm_area_struct *vma = vmf->vma;
4058 vm_fault_t ret, tmp;
4059
4060 ret = __do_fault(vmf);
4061 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4062 return ret;
4063
4064
4065
4066
4067
4068 if (vma->vm_ops->page_mkwrite) {
4069 unlock_page(vmf->page);
4070 tmp = do_page_mkwrite(vmf);
4071 if (unlikely(!tmp ||
4072 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
4073 put_page(vmf->page);
4074 return tmp;
4075 }
4076 }
4077
4078 ret |= finish_fault(vmf);
4079 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
4080 VM_FAULT_RETRY))) {
4081 unlock_page(vmf->page);
4082 put_page(vmf->page);
4083 return ret;
4084 }
4085
4086 ret |= fault_dirty_shared_page(vmf);
4087 return ret;
4088}
4089
4090
4091
4092
4093
4094
4095
4096
4097
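/*
 * Top-level handler for faults in VMAs that have vm_ops: dispatch to
 * the read, private-COW or shared write fault paths.  A VMA without a
 * ->fault handler gets SIGBUS (or NOPAGE if a pte was installed by a
 * racing fault), and any leftover preallocated pte page is freed
 * before returning.
 */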
4098static vm_fault_t do_fault(struct vm_fault *vmf)
4099{
4100 struct vm_area_struct *vma = vmf->vma;
4101 struct mm_struct *vm_mm = vma->vm_mm;
4102 vm_fault_t ret;
4103
4104
4105
4106
4107 if (!vma->vm_ops->fault) {
4108
4109
4110
4111
4112 if (unlikely(!pmd_present(*vmf->pmd)))
4113 ret = VM_FAULT_SIGBUS;
4114 else {
4115 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
4116 vmf->pmd,
4117 vmf->address,
4118 &vmf->ptl);
4119
4120
4121
4122
4123
4124
4125
4126 if (unlikely(pte_none(*vmf->pte)))
4127 ret = VM_FAULT_SIGBUS;
4128 else
4129 ret = VM_FAULT_NOPAGE;
4130
4131 pte_unmap_unlock(vmf->pte, vmf->ptl);
4132 }
4133 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
4134 ret = do_read_fault(vmf);
4135 else if (!(vma->vm_flags & VM_SHARED))
4136 ret = do_cow_fault(vmf);
4137 else
4138 ret = do_shared_fault(vmf);
4139
4140
4141 if (vmf->prealloc_pte) {
4142 pte_free(vm_mm, vmf->prealloc_pte);
4143 vmf->prealloc_pte = NULL;
4144 }
4145 return ret;
4146}
4147
4148static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
4149 unsigned long addr, int page_nid,
4150 int *flags)
4151{
4152 get_page(page);
4153
4154 count_vm_numa_event(NUMA_HINT_FAULTS);
4155 if (page_nid == numa_node_id()) {
4156 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
4157 *flags |= TNF_FAULT_LOCAL;
4158 }
4159
4160 return mpol_misplaced(page, vma, addr);
4161}
4162
4163static vm_fault_t do_numa_page(struct vm_fault *vmf)
4164{
4165 struct vm_area_struct *vma = vmf->vma;
4166 struct page *page = NULL;
4167 int page_nid = NUMA_NO_NODE;
4168 int last_cpupid;
4169 int target_nid;
4170 bool migrated = false;
4171 pte_t pte, old_pte;
4172 bool was_writable = pte_savedwrite(vmf->orig_pte);
4173 int flags = 0;
4174
4175
4176
4177
4178
4179
4180 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4181 spin_lock(vmf->ptl);
4182 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4183 pte_unmap_unlock(vmf->pte, vmf->ptl);
4184 goto out;
4185 }
4186
4187
4188
4189
4190
4191 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
4192 pte = pte_modify(old_pte, vma->vm_page_prot);
4193 pte = pte_mkyoung(pte);
4194 if (was_writable)
4195 pte = pte_mkwrite(pte);
4196 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4197 update_mmu_cache(vma, vmf->address, vmf->pte);
4198
4199 page = vm_normal_page(vma, vmf->address, pte);
4200 if (!page) {
4201 pte_unmap_unlock(vmf->pte, vmf->ptl);
4202 return 0;
4203 }
4204
4205
4206 if (PageCompound(page)) {
4207 pte_unmap_unlock(vmf->pte, vmf->ptl);
4208 return 0;
4209 }
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219 if (!pte_write(pte))
4220 flags |= TNF_NO_GROUP;
4221
4222
4223
4224
4225
4226 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
4227 flags |= TNF_SHARED;
4228
4229 last_cpupid = page_cpupid_last(page);
4230 page_nid = page_to_nid(page);
4231 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4232 &flags);
4233 pte_unmap_unlock(vmf->pte, vmf->ptl);
4234 if (target_nid == NUMA_NO_NODE) {
4235 put_page(page);
4236 goto out;
4237 }
4238
4239
4240 migrated = migrate_misplaced_page(page, vma, target_nid);
4241 if (migrated) {
4242 page_nid = target_nid;
4243 flags |= TNF_MIGRATED;
4244 } else
4245 flags |= TNF_MIGRATE_FAIL;
4246
4247out:
4248 if (page_nid != NUMA_NO_NODE)
4249 task_numa_fault(last_cpupid, page_nid, 1, flags);
4250 return 0;
4251}
4252
4253static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4254{
4255 if (vma_is_anonymous(vmf->vma))
4256 return do_huge_pmd_anonymous_page(vmf);
4257 if (vmf->vma->vm_ops->huge_fault)
4258 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4259 return VM_FAULT_FALLBACK;
4260}
4261
4262
4263static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
4264{
4265 if (vma_is_anonymous(vmf->vma)) {
4266 if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
4267 return handle_userfault(vmf, VM_UFFD_WP);
4268 return do_huge_pmd_wp_page(vmf, orig_pmd);
4269 }
4270 if (vmf->vma->vm_ops->huge_fault) {
4271 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4272
4273 if (!(ret & VM_FAULT_FALLBACK))
4274 return ret;
4275 }
4276
4277
4278 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4279
4280 return VM_FAULT_FALLBACK;
4281}
4282
4283static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4284{
4285#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4286 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4287
4288 if (vma_is_anonymous(vmf->vma))
4289 goto split;
4290 if (vmf->vma->vm_ops->huge_fault) {
4291 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4292
4293 if (!(ret & VM_FAULT_FALLBACK))
4294 return ret;
4295 }
4296split:
4297
4298 __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4299#endif
4300 return VM_FAULT_FALLBACK;
4301}
4302
4303static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4304{
4305#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4306
4307 if (vma_is_anonymous(vmf->vma))
4308 return VM_FAULT_FALLBACK;
4309 if (vmf->vma->vm_ops->huge_fault)
4310 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4311#endif
4312 return VM_FAULT_FALLBACK;
4313}
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
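/*
 * Handle the lowest level of a fault: read the pte without taking its
 * lock, then dispatch.  No pte in an anonymous VMA goes to
 * do_anonymous_page(), no pte with vm_ops goes to do_fault(), a swap
 * entry goes to do_swap_page(), and a NUMA-protected pte goes to
 * do_numa_page().  Otherwise the pte lock is taken and the entry is
 * marked young (and dirty/writable for write faults, going through
 * do_wp_page() when the pte is not writable).
 */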
4330static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4331{
4332 pte_t entry;
4333
4334 if (unlikely(pmd_none(*vmf->pmd))) {
4335
4336
4337
4338
4339
4340
4341 vmf->pte = NULL;
4342 } else {
4343
4344 if (pmd_devmap_trans_unstable(vmf->pmd))
4345 return 0;
4346
4347
4348
4349
4350
4351
4352 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4353 vmf->orig_pte = *vmf->pte;
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363 barrier();
4364 if (pte_none(vmf->orig_pte)) {
4365 pte_unmap(vmf->pte);
4366 vmf->pte = NULL;
4367 }
4368 }
4369
4370 if (!vmf->pte) {
4371 if (vma_is_anonymous(vmf->vma))
4372 return do_anonymous_page(vmf);
4373 else
4374 return do_fault(vmf);
4375 }
4376
4377 if (!pte_present(vmf->orig_pte))
4378 return do_swap_page(vmf);
4379
4380 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4381 return do_numa_page(vmf);
4382
4383 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4384 spin_lock(vmf->ptl);
4385 entry = vmf->orig_pte;
4386 if (unlikely(!pte_same(*vmf->pte, entry))) {
4387 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4388 goto unlock;
4389 }
4390 if (vmf->flags & FAULT_FLAG_WRITE) {
4391 if (!pte_write(entry))
4392 return do_wp_page(vmf);
4393 entry = pte_mkdirty(entry);
4394 }
4395 entry = pte_mkyoung(entry);
4396 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4397 vmf->flags & FAULT_FLAG_WRITE)) {
4398 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4399 } else {
4400
4401 if (vmf->flags & FAULT_FLAG_TRIED)
4402 goto unlock;
4403
4404
4405
4406
4407
4408
4409 if (vmf->flags & FAULT_FLAG_WRITE)
4410 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4411 }
4412unlock:
4413 pte_unmap_unlock(vmf->pte, vmf->ptl);
4414 return 0;
4415}
4416
4417
4418
4419
4420
4421
4422
4423static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4424 unsigned long address, unsigned int flags)
4425{
4426 struct vm_fault vmf = {
4427 .vma = vma,
4428 .address = address & PAGE_MASK,
4429 .flags = flags,
4430 .pgoff = linear_page_index(vma, address),
4431 .gfp_mask = __get_fault_gfp_mask(vma),
4432 };
4433 unsigned int dirty = flags & FAULT_FLAG_WRITE;
4434 struct mm_struct *mm = vma->vm_mm;
4435 pgd_t *pgd;
4436 p4d_t *p4d;
4437 vm_fault_t ret;
4438
4439 pgd = pgd_offset(mm, address);
4440 p4d = p4d_alloc(mm, pgd, address);
4441 if (!p4d)
4442 return VM_FAULT_OOM;
4443
4444 vmf.pud = pud_alloc(mm, p4d, address);
4445 if (!vmf.pud)
4446 return VM_FAULT_OOM;
4447retry_pud:
4448 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4449 ret = create_huge_pud(&vmf);
4450 if (!(ret & VM_FAULT_FALLBACK))
4451 return ret;
4452 } else {
4453 pud_t orig_pud = *vmf.pud;
4454
4455 barrier();
4456 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
4457
4458
4459
4460 if (dirty && !pud_write(orig_pud)) {
4461 ret = wp_huge_pud(&vmf, orig_pud);
4462 if (!(ret & VM_FAULT_FALLBACK))
4463 return ret;
4464 } else {
4465 huge_pud_set_accessed(&vmf, orig_pud);
4466 return 0;
4467 }
4468 }
4469 }
4470
4471 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4472 if (!vmf.pmd)
4473 return VM_FAULT_OOM;
4474
4475
4476 if (pud_trans_unstable(vmf.pud))
4477 goto retry_pud;
4478
4479 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4480 ret = create_huge_pmd(&vmf);
4481 if (!(ret & VM_FAULT_FALLBACK))
4482 return ret;
4483 } else {
4484 pmd_t orig_pmd = *vmf.pmd;
4485
4486 barrier();
4487 if (unlikely(is_swap_pmd(orig_pmd))) {
4488 VM_BUG_ON(thp_migration_supported() &&
4489 !is_pmd_migration_entry(orig_pmd));
4490 if (is_pmd_migration_entry(orig_pmd))
4491 pmd_migration_entry_wait(mm, vmf.pmd);
4492 return 0;
4493 }
4494 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4495 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4496 return do_huge_pmd_numa_page(&vmf, orig_pmd);
4497
4498 if (dirty && !pmd_write(orig_pmd)) {
4499 ret = wp_huge_pmd(&vmf, orig_pmd);
4500 if (!(ret & VM_FAULT_FALLBACK))
4501 return ret;
4502 } else {
4503 huge_pmd_set_accessed(&vmf, orig_pmd);
4504 return 0;
4505 }
4506 }
4507 }
4508
4509 return handle_pte_fault(&vmf);
4510}
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
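/*
 * Account a completed page fault: bump the task's major or minor fault
 * counter and, when a pt_regs is supplied, emit the matching perf
 * software event.  Faults that returned an error or will be retried
 * are not counted here; a fault counts as major if it was marked
 * VM_FAULT_MAJOR or had to be retried (FAULT_FLAG_TRIED).
 */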
4527static inline void mm_account_fault(struct pt_regs *regs,
4528 unsigned long address, unsigned int flags,
4529 vm_fault_t ret)
4530{
4531 bool major;
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544 if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
4545 return;
4546
4547
4548
4549
4550
4551
4552 major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
4553
4554 if (major)
4555 current->maj_flt++;
4556 else
4557 current->min_flt++;
4558
4559
4560
4561
4562
4563
4564 if (!regs)
4565 return;
4566
4567 if (major)
4568 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
4569 else
4570 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
4571}
4572
4573
4574
4575
4576
4577
4578
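/*
 * Entry point used by architecture fault handlers, normally called
 * with the mmap_lock held for read: counts the fault, checks
 * arch-specific access permissions, enters and leaves the memcg
 * user-fault context, and dispatches to hugetlb_fault() or
 * __handle_mm_fault().
 */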
4579vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4580 unsigned int flags, struct pt_regs *regs)
4581{
4582 vm_fault_t ret;
4583
4584 __set_current_state(TASK_RUNNING);
4585
4586 count_vm_event(PGFAULT);
4587 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4588
4589
4590 check_sync_rss_stat(current);
4591
4592 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4593 flags & FAULT_FLAG_INSTRUCTION,
4594 flags & FAULT_FLAG_REMOTE))
4595 return VM_FAULT_SIGSEGV;
4596
4597
4598
4599
4600
4601 if (flags & FAULT_FLAG_USER)
4602 mem_cgroup_enter_user_fault();
4603
4604 if (unlikely(is_vm_hugetlb_page(vma)))
4605 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4606 else
4607 ret = __handle_mm_fault(vma, address, flags);
4608
4609 if (flags & FAULT_FLAG_USER) {
4610 mem_cgroup_exit_user_fault();
4611
4612
4613
4614
4615
4616
4617 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4618 mem_cgroup_oom_synchronize(false);
4619 }
4620
4621 mm_account_fault(regs, address, flags, ret);
4622
4623 return ret;
4624}
4625EXPORT_SYMBOL_GPL(handle_mm_fault);
4626
4627#ifndef __PAGETABLE_P4D_FOLDED
4628
4629
4630
4631
4632int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4633{
4634 p4d_t *new = p4d_alloc_one(mm, address);
4635 if (!new)
4636 return -ENOMEM;
4637
4638 smp_wmb();
4639
4640 spin_lock(&mm->page_table_lock);
4641 if (pgd_present(*pgd))
4642 p4d_free(mm, new);
4643 else
4644 pgd_populate(mm, pgd, new);
4645 spin_unlock(&mm->page_table_lock);
4646 return 0;
4647}
4648#endif
4649
4650#ifndef __PAGETABLE_PUD_FOLDED
4651
4652
4653
4654
4655int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4656{
4657 pud_t *new = pud_alloc_one(mm, address);
4658 if (!new)
4659 return -ENOMEM;
4660
4661 smp_wmb();
4662
4663 spin_lock(&mm->page_table_lock);
4664 if (!p4d_present(*p4d)) {
4665 mm_inc_nr_puds(mm);
4666 p4d_populate(mm, p4d, new);
4667 } else
4668 pud_free(mm, new);
4669 spin_unlock(&mm->page_table_lock);
4670 return 0;
4671}
4672#endif
4673
4674#ifndef __PAGETABLE_PMD_FOLDED
4675
4676
4677
4678
4679int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4680{
4681 spinlock_t *ptl;
4682 pmd_t *new = pmd_alloc_one(mm, address);
4683 if (!new)
4684 return -ENOMEM;
4685
4686 smp_wmb();
4687
4688 ptl = pud_lock(mm, pud);
4689 if (!pud_present(*pud)) {
4690 mm_inc_nr_pmds(mm);
4691 pud_populate(mm, pud, new);
4692 } else
4693 pmd_free(mm, new);
4694 spin_unlock(ptl);
4695 return 0;
4696}
4697#endif
4698
4699static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4700 struct mmu_notifier_range *range,
4701 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4702{
4703 pgd_t *pgd;
4704 p4d_t *p4d;
4705 pud_t *pud;
4706 pmd_t *pmd;
4707 pte_t *ptep;
4708
4709 pgd = pgd_offset(mm, address);
4710 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4711 goto out;
4712
4713 p4d = p4d_offset(pgd, address);
4714 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4715 goto out;
4716
4717 pud = pud_offset(p4d, address);
4718 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4719 goto out;
4720
4721 pmd = pmd_offset(pud, address);
4722 VM_BUG_ON(pmd_trans_huge(*pmd));
4723
4724 if (pmd_huge(*pmd)) {
4725 if (!pmdpp)
4726 goto out;
4727
4728 if (range) {
4729 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4730 NULL, mm, address & PMD_MASK,
4731 (address & PMD_MASK) + PMD_SIZE);
4732 mmu_notifier_invalidate_range_start(range);
4733 }
4734 *ptlp = pmd_lock(mm, pmd);
4735 if (pmd_huge(*pmd)) {
4736 *pmdpp = pmd;
4737 return 0;
4738 }
4739 spin_unlock(*ptlp);
4740 if (range)
4741 mmu_notifier_invalidate_range_end(range);
4742 }
4743
4744 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4745 goto out;
4746
4747 if (range) {
4748 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4749 address & PAGE_MASK,
4750 (address & PAGE_MASK) + PAGE_SIZE);
4751 mmu_notifier_invalidate_range_start(range);
4752 }
4753 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4754 if (!pte_present(*ptep))
4755 goto unlock;
4756 *ptepp = ptep;
4757 return 0;
4758unlock:
4759 pte_unmap_unlock(ptep, *ptlp);
4760 if (range)
4761 mmu_notifier_invalidate_range_end(range);
4762out:
4763 return -EINVAL;
4764}
4765
4766static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4767 pte_t **ptepp, spinlock_t **ptlp)
4768{
4769 int res;
4770
4771
4772 (void) __cond_lock(*ptlp,
4773 !(res = __follow_pte_pmd(mm, address, NULL,
4774 ptepp, NULL, ptlp)));
4775 return res;
4776}
4777
4778int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4779 struct mmu_notifier_range *range,
4780 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4781{
4782 int res;
4783
4784
4785 (void) __cond_lock(*ptlp,
4786 !(res = __follow_pte_pmd(mm, address, range,
4787 ptepp, pmdpp, ptlp)));
4788 return res;
4789}
4790EXPORT_SYMBOL(follow_pte_pmd);
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4803 unsigned long *pfn)
4804{
4805 int ret = -EINVAL;
4806 spinlock_t *ptl;
4807 pte_t *ptep;
4808
4809 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4810 return ret;
4811
4812 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4813 if (ret)
4814 return ret;
4815 *pfn = pte_pfn(*ptep);
4816 pte_unmap_unlock(ptep, ptl);
4817 return 0;
4818}
4819EXPORT_SYMBOL(follow_pfn);
4820
4821#ifdef CONFIG_HAVE_IOREMAP_PROT
4822int follow_phys(struct vm_area_struct *vma,
4823 unsigned long address, unsigned int flags,
4824 unsigned long *prot, resource_size_t *phys)
4825{
4826 int ret = -EINVAL;
4827 pte_t *ptep, pte;
4828 spinlock_t *ptl;
4829
4830 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4831 goto out;
4832
4833 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4834 goto out;
4835 pte = *ptep;
4836
4837 if ((flags & FOLL_WRITE) && !pte_write(pte))
4838 goto unlock;
4839
4840 *prot = pgprot_val(pte_pgprot(pte));
4841 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4842
4843 ret = 0;
4844unlock:
4845 pte_unmap_unlock(ptep, ptl);
4846out:
4847 return ret;
4848}
4849
4850int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4851 void *buf, int len, int write)
4852{
4853 resource_size_t phys_addr;
4854 unsigned long prot = 0;
4855 void __iomem *maddr;
4856 int offset = addr & (PAGE_SIZE-1);
4857
4858 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4859 return -EINVAL;
4860
4861 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4862 if (!maddr)
4863 return -ENOMEM;
4864
4865 if (write)
4866 memcpy_toio(maddr + offset, buf, len);
4867 else
4868 memcpy_fromio(buf, maddr + offset, len);
4869 iounmap(maddr);
4870
4871 return len;
4872}
4873EXPORT_SYMBOL_GPL(generic_access_phys);
4874#endif
4875
4876
4877
4878
4879
4880int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4881 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4882{
4883 struct vm_area_struct *vma;
4884 void *old_buf = buf;
4885 int write = gup_flags & FOLL_WRITE;
4886
4887 if (mmap_read_lock_killable(mm))
4888 return 0;
4889
4890
4891 while (len) {
4892 int bytes, ret, offset;
4893 void *maddr;
4894 struct page *page = NULL;
4895
4896 ret = get_user_pages_remote(mm, addr, 1,
4897 gup_flags, &page, &vma, NULL);
4898 if (ret <= 0) {
4899#ifndef CONFIG_HAVE_IOREMAP_PROT
4900 break;
4901#else
4902
4903
4904
4905
4906 vma = find_vma(mm, addr);
4907 if (!vma || vma->vm_start > addr)
4908 break;
4909 if (vma->vm_ops && vma->vm_ops->access)
4910 ret = vma->vm_ops->access(vma, addr, buf,
4911 len, write);
4912 if (ret <= 0)
4913 break;
4914 bytes = ret;
4915#endif
4916 } else {
4917 bytes = len;
4918 offset = addr & (PAGE_SIZE-1);
4919 if (bytes > PAGE_SIZE-offset)
4920 bytes = PAGE_SIZE-offset;
4921
4922 maddr = kmap(page);
4923 if (write) {
4924 copy_to_user_page(vma, page, addr,
4925 maddr + offset, buf, bytes);
4926 set_page_dirty_lock(page);
4927 } else {
4928 copy_from_user_page(vma, page, addr,
4929 buf, maddr + offset, bytes);
4930 }
4931 kunmap(page);
4932 put_page(page);
4933 }
4934 len -= bytes;
4935 buf += bytes;
4936 addr += bytes;
4937 }
4938 mmap_read_unlock(mm);
4939
4940 return buf - old_buf;
4941}
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4956 void *buf, int len, unsigned int gup_flags)
4957{
4958 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4959}
4960
4961
4962
4963
4964
4965
4966int access_process_vm(struct task_struct *tsk, unsigned long addr,
4967 void *buf, int len, unsigned int gup_flags)
4968{
4969 struct mm_struct *mm;
4970 int ret;
4971
4972 mm = get_task_mm(tsk);
4973 if (!mm)
4974 return 0;
4975
4976 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4977
4978 mmput(mm);
4979
4980 return ret;
4981}
4982EXPORT_SYMBOL_GPL(access_process_vm);
4983
4984
4985
4986
4987void print_vma_addr(char *prefix, unsigned long ip)
4988{
4989 struct mm_struct *mm = current->mm;
4990 struct vm_area_struct *vma;
4991
4992
4993
4994
4995 if (!mmap_read_trylock(mm))
4996 return;
4997
4998 vma = find_vma(mm, ip);
4999 if (vma && vma->vm_file) {
5000 struct file *f = vma->vm_file;
5001 char *buf = (char *)__get_free_page(GFP_NOWAIT);
5002 if (buf) {
5003 char *p;
5004
5005 p = file_path(f, buf, PAGE_SIZE);
5006 if (IS_ERR(p))
5007 p = "?";
5008 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
5009 vma->vm_start,
5010 vma->vm_end - vma->vm_start);
5011 free_page((unsigned long)buf);
5012 }
5013 }
5014 mmap_read_unlock(mm);
5015}
5016
5017#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5018void __might_fault(const char *file, int line)
5019{
5020
5021
5022
5023
5024
5025
5026 if (uaccess_kernel())
5027 return;
5028 if (pagefault_disabled())
5029 return;
5030 __might_sleep(file, line, 0);
5031#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5032 if (current->mm)
5033		might_lock_read(&current->mm->mmap_lock);
5034#endif
5035}
5036EXPORT_SYMBOL(__might_fault);
5037#endif
5038
5039#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
5040
5041
5042
5043
5044
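/*
 * Process (clear or copy) a huge page one base page at a time, with a
 * cond_resched() between subpages.  Subpages far from the faulting
 * address hint are handled first and the ones around the hint last, so
 * the data the user is about to touch stays cache-hot.
 */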
5045static inline void process_huge_page(
5046 unsigned long addr_hint, unsigned int pages_per_huge_page,
5047 void (*process_subpage)(unsigned long addr, int idx, void *arg),
5048 void *arg)
5049{
5050 int i, n, base, l;
5051 unsigned long addr = addr_hint &
5052 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5053
5054
5055 might_sleep();
5056 n = (addr_hint - addr) / PAGE_SIZE;
5057 if (2 * n <= pages_per_huge_page) {
5058
5059 base = 0;
5060 l = n;
5061
5062 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
5063 cond_resched();
5064 process_subpage(addr + i * PAGE_SIZE, i, arg);
5065 }
5066 } else {
5067
5068 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
5069 l = pages_per_huge_page - n;
5070
5071 for (i = 0; i < base; i++) {
5072 cond_resched();
5073 process_subpage(addr + i * PAGE_SIZE, i, arg);
5074 }
5075 }
5076
5077
5078
5079
5080 for (i = 0; i < l; i++) {
5081 int left_idx = base + i;
5082 int right_idx = base + 2 * l - 1 - i;
5083
5084 cond_resched();
5085 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
5086 cond_resched();
5087 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
5088 }
5089}
5090
5091static void clear_gigantic_page(struct page *page,
5092 unsigned long addr,
5093 unsigned int pages_per_huge_page)
5094{
5095 int i;
5096 struct page *p = page;
5097
5098 might_sleep();
5099 for (i = 0; i < pages_per_huge_page;
5100 i++, p = mem_map_next(p, page, i)) {
5101 cond_resched();
5102 clear_user_highpage(p, addr + i * PAGE_SIZE);
5103 }
5104}
5105
5106static void clear_subpage(unsigned long addr, int idx, void *arg)
5107{
5108 struct page *page = arg;
5109
5110 clear_user_highpage(page + idx, addr);
5111}
5112
5113void clear_huge_page(struct page *page,
5114 unsigned long addr_hint, unsigned int pages_per_huge_page)
5115{
5116 unsigned long addr = addr_hint &
5117 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5118
5119 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5120 clear_gigantic_page(page, addr, pages_per_huge_page);
5121 return;
5122 }
5123
5124 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
5125}
5126
5127static void copy_user_gigantic_page(struct page *dst, struct page *src,
5128 unsigned long addr,
5129 struct vm_area_struct *vma,
5130 unsigned int pages_per_huge_page)
5131{
5132 int i;
5133 struct page *dst_base = dst;
5134 struct page *src_base = src;
5135
5136 for (i = 0; i < pages_per_huge_page; ) {
5137 cond_resched();
5138 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
5139
5140 i++;
5141 dst = mem_map_next(dst, dst_base, i);
5142 src = mem_map_next(src, src_base, i);
5143 }
5144}
5145
5146struct copy_subpage_arg {
5147 struct page *dst;
5148 struct page *src;
5149 struct vm_area_struct *vma;
5150};
5151
5152static void copy_subpage(unsigned long addr, int idx, void *arg)
5153{
5154 struct copy_subpage_arg *copy_arg = arg;
5155
5156 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
5157 addr, copy_arg->vma);
5158}
5159
5160void copy_user_huge_page(struct page *dst, struct page *src,
5161 unsigned long addr_hint, struct vm_area_struct *vma,
5162 unsigned int pages_per_huge_page)
5163{
5164 unsigned long addr = addr_hint &
5165 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5166 struct copy_subpage_arg arg = {
5167 .dst = dst,
5168 .src = src,
5169 .vma = vma,
5170 };
5171
5172 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5173 copy_user_gigantic_page(dst, src, addr, vma,
5174 pages_per_huge_page);
5175 return;
5176 }
5177
5178 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
5179}
5180
5181long copy_huge_page_from_user(struct page *dst_page,
5182 const void __user *usr_src,
5183 unsigned int pages_per_huge_page,
5184 bool allow_pagefault)
5185{
5186 void *src = (void *)usr_src;
5187 void *page_kaddr;
5188 unsigned long i, rc = 0;
5189 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
5190
5191 for (i = 0; i < pages_per_huge_page; i++) {
5192 if (allow_pagefault)
5193 page_kaddr = kmap(dst_page + i);
5194 else
5195 page_kaddr = kmap_atomic(dst_page + i);
5196 rc = copy_from_user(page_kaddr,
5197 (const void __user *)(src + i * PAGE_SIZE),
5198 PAGE_SIZE);
5199 if (allow_pagefault)
5200 kunmap(dst_page + i);
5201 else
5202 kunmap_atomic(page_kaddr);
5203
5204 ret_val -= (PAGE_SIZE - rc);
5205 if (rc)
5206 break;
5207
5208 cond_resched();
5209 }
5210 return ret_val;
5211}
5212#endif
5213
5214#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
5215
5216static struct kmem_cache *page_ptl_cachep;
5217
5218void __init ptlock_cache_init(void)
5219{
5220 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
5221 SLAB_PANIC, NULL);
5222}
5223
5224bool ptlock_alloc(struct page *page)
5225{
5226 spinlock_t *ptl;
5227
5228 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
5229 if (!ptl)
5230 return false;
5231 page->ptl = ptl;
5232 return true;
5233}
5234
5235void ptlock_free(struct page *page)
5236{
5237 kmem_cache_free(page_ptl_cachep, page->ptl);
5238}
5239#endif
5240