// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"
22
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}
40
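/*
 * Look up the VMA covering @addr and, for private mappings, make sure its
 * anon_vma is allocated while mmap_lock is still held, so that installing
 * anonymous pages later cannot fail on a missing anon_vma.
 * Returns ERR_PTR(-ENOENT) if no VMA covers @addr, or ERR_PTR(-ENOMEM) if
 * anon_vma_prepare() fails.
 */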
41static __always_inline
42struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
43 unsigned long addr)
44{
45 struct vm_area_struct *vma;
46
47 mmap_assert_locked(mm);
48 vma = vma_lookup(mm, addr);
49 if (!vma)
50 vma = ERR_PTR(-ENOENT);
51 else if (!(vma->vm_flags & VM_SHARED) &&
52 unlikely(anon_vma_prepare(vma)))
53 vma = ERR_PTR(-ENOMEM);
54
55 return vma;
56}
57
#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock the VMA containing @address.
 * @mm: mm to search the VMA in.
 * @address: address that the VMA should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked VMA containing @address, -ENOENT if no VMA is found, or
 * -ENOMEM if an anon_vma could not be allocated for a private mapping.
 */
69static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
70 unsigned long address)
71{
72 struct vm_area_struct *vma;
73
74 vma = lock_vma_under_rcu(mm, address);
75 if (vma) {
		/*
		 * lock_vma_under_rcu() only checks anon_vma for private
		 * anonymous mappings. But we need to ensure it is assigned in
		 * private file-backed vmas as well.
		 */
80 if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
81 vma_end_read(vma);
82 else
83 return vma;
84 }
85
86 mmap_read_lock(mm);
87 vma = find_vma_and_prepare_anon(mm, address);
88 if (!IS_ERR(vma)) {
89 bool locked = vma_start_read_locked(vma);
90
91 if (!locked)
92 vma = ERR_PTR(-EAGAIN);
93 }
94
95 mmap_read_unlock(mm);
96 return vma;
97}
98
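/*
 * Lock the VMA that will receive the mfill: take the per-VMA read lock on
 * the VMA covering @dst_start, then check that [dst_start, dst_start + len)
 * lies entirely inside it and that it is registered with userfaultfd.
 * On validation failure the VMA lock is dropped and -ENOENT is returned.
 */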
99static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
100 unsigned long dst_start,
101 unsigned long len)
102{
103 struct vm_area_struct *dst_vma;
104
105 dst_vma = uffd_lock_vma(dst_mm, dst_start);
106 if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
107 return dst_vma;
108
109 vma_end_read(dst_vma);
110 return ERR_PTR(-ENOENT);
111}
112
113static void uffd_mfill_unlock(struct vm_area_struct *vma)
114{
115 vma_end_read(vma);
116}
117
118#else
119
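/*
 * Fallback for !CONFIG_PER_VMA_LOCK: same contract as above, but the VMA is
 * found and validated under mmap_read_lock(), which stays held on success
 * and is released by uffd_mfill_unlock().
 */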
120static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
121 unsigned long dst_start,
122 unsigned long len)
123{
124 struct vm_area_struct *dst_vma;
125
126 mmap_read_lock(dst_mm);
127 dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
128 if (IS_ERR(dst_vma))
129 goto out_unlock;
130
131 if (validate_dst_vma(dst_vma, dst_start + len))
132 return dst_vma;
133
134 dst_vma = ERR_PTR(-ENOENT);
135out_unlock:
136 mmap_read_unlock(dst_mm);
137 return dst_vma;
138}
139
140static void uffd_mfill_unlock(struct vm_area_struct *vma)
141{
142 mmap_read_unlock(vma->vm_mm);
143}
144#endif
145
146
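/*
 * Return true when @dst_addr maps beyond the current size of the file
 * backing @dst_vma (always false for anonymous mappings); fills past EOF
 * must fail with -EFAULT.
 */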
147static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
148 unsigned long dst_addr)
149{
150 struct inode *inode;
151 pgoff_t offset, max_off;
152
153 if (!dst_vma->vm_file)
154 return false;
155
156 inode = dst_vma->vm_file->f_inode;
157 offset = linear_page_index(dst_vma, dst_addr);
158 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
159 return offset >= max_off;
160}
161
/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
168int mfill_atomic_install_pte(pmd_t *dst_pmd,
169 struct vm_area_struct *dst_vma,
170 unsigned long dst_addr, struct page *page,
171 bool newly_allocated, uffd_flags_t flags)
172{
173 int ret;
174 struct mm_struct *dst_mm = dst_vma->vm_mm;
175 pte_t _dst_pte, *dst_pte;
176 bool writable = dst_vma->vm_flags & VM_WRITE;
177 bool vm_shared = dst_vma->vm_flags & VM_SHARED;
178 spinlock_t *ptl;
179 struct folio *folio = page_folio(page);
180 bool page_in_cache = folio_mapping(folio);
181
182 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
183 _dst_pte = pte_mkdirty(_dst_pte);
184 if (page_in_cache && !vm_shared)
185 writable = false;
186 if (writable)
187 _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
188 if (flags & MFILL_ATOMIC_WP)
189 _dst_pte = pte_mkuffd_wp(_dst_pte);
190
191 ret = -EAGAIN;
192 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
193 if (!dst_pte)
194 goto out;
195
196 if (mfill_file_over_size(dst_vma, dst_addr)) {
197 ret = -EFAULT;
198 goto out_unlock;
199 }
200
201 ret = -EEXIST;
	/*
	 * We allow to overwrite a pte marker: consider when both MISSING|WP
	 * registered, we firstly wr-protect a none pte which has no page cache
	 * page backing it, then access the page.
	 */
207 if (!pte_none_mostly(ptep_get(dst_pte)))
208 goto out_unlock;
209
210 if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
212 if (newly_allocated)
213 folio_add_lru(folio);
214 folio_add_file_rmap_pte(folio, page, dst_vma);
215 } else {
216 folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
217 folio_add_lru_vma(folio, dst_vma);
218 }
219
	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
224 inc_mm_counter(dst_mm, mm_counter(folio));
225
226 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
229 update_mmu_cache(dst_vma, dst_addr, dst_pte);
230 ret = 0;
231out_unlock:
232 pte_unmap_unlock(dst_pte, ptl);
233out:
234 return ret;
235}
236
237static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
238 struct vm_area_struct *dst_vma,
239 unsigned long dst_addr,
240 unsigned long src_addr,
241 uffd_flags_t flags,
242 struct folio **foliop)
243{
244 void *kaddr;
245 int ret;
246 struct folio *folio;
247
248 if (!*foliop) {
249 ret = -ENOMEM;
250 folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
251 dst_addr);
252 if (!folio)
253 goto out;
254
255 kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here. Despite the mmap_lock
		 * being read-recursive, a deadlock is still possible if a
		 * writer is queued between two readers: the copy_from_user()
		 * below could fault and then block trying to re-take the
		 * very mmap_lock we already hold for reading.
		 *
		 * Disable page faults to prevent that deadlock and retry
		 * the copy outside the mmap_lock on -ENOENT.
		 */
271 pagefault_disable();
272 ret = copy_from_user(kaddr, (const void __user *) src_addr,
273 PAGE_SIZE);
274 pagefault_enable();
275 kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
278 if (unlikely(ret)) {
279 ret = -ENOENT;
280 *foliop = folio;
			/* don't free the folio */
282 goto out;
283 }
284
285 flush_dcache_folio(folio);
286 } else {
287 folio = *foliop;
288 *foliop = NULL;
289 }
290
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
296 __folio_mark_uptodate(folio);
297
298 ret = -ENOMEM;
299 if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
300 goto out_release;
301
302 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
303 &folio->page, true, flags);
304 if (ret)
305 goto out_release;
306out:
307 return ret;
308out_release:
309 folio_put(folio);
310 goto out;
311}
312
313static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
314 struct vm_area_struct *dst_vma,
315 unsigned long dst_addr)
316{
317 struct folio *folio;
318 int ret = -ENOMEM;
319
320 folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
321 if (!folio)
322 return ret;
323
324 if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
325 goto out_put;
326
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio becomes visible before the set_pte_at()
	 * write in mfill_atomic_install_pte().
	 */
332 __folio_mark_uptodate(folio);
333
334 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
335 &folio->page, true, 0);
336 if (ret)
337 goto out_put;
338
339 return 0;
340out_put:
341 folio_put(folio);
342 return ret;
343}
344
345static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
346 struct vm_area_struct *dst_vma,
347 unsigned long dst_addr)
348{
349 pte_t _dst_pte, *dst_pte;
350 spinlock_t *ptl;
351 int ret;
352
353 if (mm_forbids_zeropage(dst_vma->vm_mm))
354 return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
355
356 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
357 dst_vma->vm_page_prot));
358 ret = -EAGAIN;
359 dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
360 if (!dst_pte)
361 goto out;
362 if (mfill_file_over_size(dst_vma, dst_addr)) {
363 ret = -EFAULT;
364 goto out_unlock;
365 }
366 ret = -EEXIST;
367 if (!pte_none(ptep_get(dst_pte)))
368 goto out_unlock;
369 set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
371 update_mmu_cache(dst_vma, dst_addr, dst_pte);
372 ret = 0;
373out_unlock:
374 pte_unmap_unlock(dst_pte, ptl);
375out:
376 return ret;
377}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
380static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
381 struct vm_area_struct *dst_vma,
382 unsigned long dst_addr,
383 uffd_flags_t flags)
384{
385 struct inode *inode = file_inode(dst_vma->vm_file);
386 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
387 struct folio *folio;
388 struct page *page;
389 int ret;
390
391 ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
393 if (ret == -ENOENT)
394 ret = -EFAULT;
395 if (ret)
396 goto out;
397 if (!folio) {
398 ret = -EFAULT;
399 goto out;
400 }
401
402 page = folio_file_page(folio, pgoff);
403 if (PageHWPoison(page)) {
404 ret = -EIO;
405 goto out_release;
406 }
407
408 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
409 page, false, flags);
410 if (ret)
411 goto out_release;
412
413 folio_unlock(folio);
414 ret = 0;
415out:
416 return ret;
417out_release:
418 folio_unlock(folio);
419 folio_put(folio);
420 goto out;
421}
422
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
424static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
425 struct vm_area_struct *dst_vma,
426 unsigned long dst_addr,
427 uffd_flags_t flags)
428{
429 int ret;
430 struct mm_struct *dst_mm = dst_vma->vm_mm;
431 pte_t _dst_pte, *dst_pte;
432 spinlock_t *ptl;
433
434 _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
435 ret = -EAGAIN;
436 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
437 if (!dst_pte)
438 goto out;
439
440 if (mfill_file_over_size(dst_vma, dst_addr)) {
441 ret = -EFAULT;
442 goto out_unlock;
443 }
444
445 ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
447 if (!pte_none(ptep_get(dst_pte)))
448 goto out_unlock;
449
450 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
453 update_mmu_cache(dst_vma, dst_addr, dst_pte);
454 ret = 0;
455out_unlock:
456 pte_unmap_unlock(dst_pte, ptl);
457out:
458 return ret;
459}
460
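/*
 * Walk (and allocate, if missing) the page-table levels down to the PMD
 * covering @address; returns NULL if a table allocation fails.
 */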
461static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
462{
463 pgd_t *pgd;
464 p4d_t *p4d;
465 pud_t *pud;
466
467 pgd = pgd_offset(mm, address);
468 p4d = p4d_alloc(mm, pgd, address);
469 if (!p4d)
470 return NULL;
471 pud = pud_alloc(mm, p4d, address);
472 if (!pud)
473 return NULL;
474
	/*
	 * Note that the pmd is not necessarily missing here: *pmd may
	 * already be established and can even be a transparent huge
	 * pmd, so callers must check what it maps.
	 */
479 return pmd_alloc(mm, pud, address);
480}
481
482#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 */
488static __always_inline ssize_t mfill_atomic_hugetlb(
489 struct userfaultfd_ctx *ctx,
490 struct vm_area_struct *dst_vma,
491 unsigned long dst_start,
492 unsigned long src_start,
493 unsigned long len,
494 uffd_flags_t flags)
495{
496 struct mm_struct *dst_mm = dst_vma->vm_mm;
497 ssize_t err;
498 pte_t *dst_pte;
499 unsigned long src_addr, dst_addr;
500 long copied;
501 struct folio *folio;
502 unsigned long vma_hpagesize;
503 pgoff_t idx;
504 u32 hash;
505 struct address_space *mapping;
506
	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
513 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
514 up_read(&ctx->map_changing_lock);
515 uffd_mfill_unlock(dst_vma);
516 return -EINVAL;
517 }
518
519 src_addr = src_start;
520 dst_addr = dst_start;
521 copied = 0;
522 folio = NULL;
523 vma_hpagesize = vma_kernel_pagesize(dst_vma);
524
	/*
	 * Validate alignment based on huge page size
	 */
528 err = -EINVAL;
529 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
530 goto out_unlock;
531
532retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
537 if (!dst_vma) {
538 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
539 if (IS_ERR(dst_vma)) {
540 err = PTR_ERR(dst_vma);
541 goto out;
542 }
543
544 err = -ENOENT;
545 if (!is_vm_hugetlb_page(dst_vma))
546 goto out_unlock_vma;
547
548 err = -EINVAL;
549 if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
550 goto out_unlock_vma;
551
		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
557 down_read(&ctx->map_changing_lock);
558 err = -EAGAIN;
559 if (atomic_read(&ctx->mmap_changing))
560 goto out_unlock;
561 }
562
563 while (src_addr < src_start + len) {
564 VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 */
572 idx = linear_page_index(dst_vma, dst_addr);
573 mapping = dst_vma->vm_file->f_mapping;
574 hash = hugetlb_fault_mutex_hash(mapping, idx);
575 mutex_lock(&hugetlb_fault_mutex_table[hash]);
576 hugetlb_vma_lock_read(dst_vma);
577
578 err = -ENOMEM;
579 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
580 if (!dst_pte) {
581 hugetlb_vma_unlock_read(dst_vma);
582 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
583 goto out_unlock;
584 }
585
586 if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
587 !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
588 err = -EEXIST;
589 hugetlb_vma_unlock_read(dst_vma);
590 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
591 goto out_unlock;
592 }
593
594 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
595 src_addr, flags, &folio);
596
597 hugetlb_vma_unlock_read(dst_vma);
598 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
599
600 cond_resched();
601
602 if (unlikely(err == -ENOENT)) {
603 up_read(&ctx->map_changing_lock);
604 uffd_mfill_unlock(dst_vma);
605 VM_WARN_ON_ONCE(!folio);
606
607 err = copy_folio_from_user(folio,
608 (const void __user *)src_addr, true);
609 if (unlikely(err)) {
610 err = -EFAULT;
611 goto out;
612 }
613
614 dst_vma = NULL;
615 goto retry;
616 } else
617 VM_WARN_ON_ONCE(folio);
618
619 if (!err) {
620 dst_addr += vma_hpagesize;
621 src_addr += vma_hpagesize;
622 copied += vma_hpagesize;
623
624 if (fatal_signal_pending(current))
625 err = -EINTR;
626 }
627 if (err)
628 break;
629 }
630
631out_unlock:
632 up_read(&ctx->map_changing_lock);
633out_unlock_vma:
634 uffd_mfill_unlock(dst_vma);
635out:
636 if (folio)
637 folio_put(folio);
638 VM_WARN_ON_ONCE(copied < 0);
639 VM_WARN_ON_ONCE(err > 0);
640 VM_WARN_ON_ONCE(!copied && !err);
641 return copied ? copied : err;
642}
643#else
/* fail at build time if gcc attempts to use this */
645extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
646 struct vm_area_struct *dst_vma,
647 unsigned long dst_start,
648 unsigned long src_start,
649 unsigned long len,
650 uffd_flags_t flags);
651#endif
652
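/*
 * Fill a single PTE according to the uffd mode in @flags: CONTINUE and
 * POISON are handled first, then COPY/ZEROPAGE go through the anonymous
 * paths for private VMAs and through shmem_mfill_atomic_pte() for shared
 * ones.
 */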
653static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
654 struct vm_area_struct *dst_vma,
655 unsigned long dst_addr,
656 unsigned long src_addr,
657 uffd_flags_t flags,
658 struct folio **foliop)
659{
660 ssize_t err;
661
662 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
663 return mfill_atomic_pte_continue(dst_pmd, dst_vma,
664 dst_addr, flags);
665 } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
666 return mfill_atomic_pte_poison(dst_pmd, dst_vma,
667 dst_addr, flags);
668 }
669
	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
680 if (!(dst_vma->vm_flags & VM_SHARED)) {
681 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
682 err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
683 dst_addr, src_addr,
684 flags, foliop);
685 else
686 err = mfill_atomic_pte_zeropage(dst_pmd,
687 dst_vma, dst_addr);
688 } else {
689 err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
690 dst_addr, src_addr,
691 flags, foliop);
692 }
693
694 return err;
695}
696
697static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
698 unsigned long dst_start,
699 unsigned long src_start,
700 unsigned long len,
701 uffd_flags_t flags)
702{
703 struct mm_struct *dst_mm = ctx->mm;
704 struct vm_area_struct *dst_vma;
705 ssize_t err;
706 pmd_t *dst_pmd;
707 unsigned long src_addr, dst_addr;
708 long copied;
709 struct folio *folio;
710
	/*
	 * Sanitize the command parameters:
	 */
714 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
715 VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
718 VM_WARN_ON_ONCE(src_start + len <= src_start);
719 VM_WARN_ON_ONCE(dst_start + len <= dst_start);
720
721 src_addr = src_start;
722 dst_addr = dst_start;
723 copied = 0;
724 folio = NULL;
725retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
730 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
731 if (IS_ERR(dst_vma)) {
732 err = PTR_ERR(dst_vma);
733 goto out;
734 }
735
	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
741 down_read(&ctx->map_changing_lock);
742 err = -EAGAIN;
743 if (atomic_read(&ctx->mmap_changing))
744 goto out_unlock;
745
746 err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
751 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
752 dst_vma->vm_flags & VM_SHARED))
753 goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
759 if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
760 goto out_unlock;
761
	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
765 if (is_vm_hugetlb_page(dst_vma))
766 return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
767 src_start, len, flags);
768
769 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
770 goto out_unlock;
771 if (!vma_is_shmem(dst_vma) &&
772 uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
773 goto out_unlock;
774
775 while (src_addr < src_start + len) {
776 pmd_t dst_pmdval;
777
778 VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
779
780 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
781 if (unlikely(!dst_pmd)) {
782 err = -ENOMEM;
783 break;
784 }
785
786 dst_pmdval = pmdp_get_lockless(dst_pmd);
787 if (unlikely(pmd_none(dst_pmdval)) &&
788 unlikely(__pte_alloc(dst_mm, dst_pmd))) {
789 err = -ENOMEM;
790 break;
791 }
792 dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is THP don't override it and just be strict.
		 * (This includes the case where the PMD used to be THP and
		 * changed back to none after __pte_alloc().)
		 */
798 if (unlikely(!pmd_present(dst_pmdval) ||
799 pmd_trans_huge(dst_pmdval))) {
800 err = -EEXIST;
801 break;
802 }
803 if (unlikely(pmd_bad(dst_pmdval))) {
804 err = -EFAULT;
805 break;
806 }
807
		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */
812 err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
813 src_addr, flags, &folio);
814 cond_resched();
815
816 if (unlikely(err == -ENOENT)) {
817 void *kaddr;
818
819 up_read(&ctx->map_changing_lock);
820 uffd_mfill_unlock(dst_vma);
821 VM_WARN_ON_ONCE(!folio);
822
823 kaddr = kmap_local_folio(folio, 0);
824 err = copy_from_user(kaddr,
825 (const void __user *) src_addr,
826 PAGE_SIZE);
827 kunmap_local(kaddr);
828 if (unlikely(err)) {
829 err = -EFAULT;
830 goto out;
831 }
832 flush_dcache_folio(folio);
833 goto retry;
834 } else
835 VM_WARN_ON_ONCE(folio);
836
837 if (!err) {
838 dst_addr += PAGE_SIZE;
839 src_addr += PAGE_SIZE;
840 copied += PAGE_SIZE;
841
842 if (fatal_signal_pending(current))
843 err = -EINTR;
844 }
845 if (err)
846 break;
847 }
848
849out_unlock:
850 up_read(&ctx->map_changing_lock);
851 uffd_mfill_unlock(dst_vma);
852out:
853 if (folio)
854 folio_put(folio);
855 VM_WARN_ON_ONCE(copied < 0);
856 VM_WARN_ON_ONCE(err > 0);
857 VM_WARN_ON_ONCE(!copied && !err);
858 return copied ? copied : err;
859}
860
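/*
 * The mfill_atomic_*() entry points below are thin wrappers that select the
 * fill mode and delegate to mfill_atomic().
 */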
861ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
862 unsigned long src_start, unsigned long len,
863 uffd_flags_t flags)
864{
865 return mfill_atomic(ctx, dst_start, src_start, len,
866 uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
867}
868
869ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
870 unsigned long start,
871 unsigned long len)
872{
873 return mfill_atomic(ctx, start, 0, len,
874 uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
875}
876
877ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
878 unsigned long len, uffd_flags_t flags)
879{
880
881
882
883
884
885
886
887 smp_wmb();
888
889 return mfill_atomic(ctx, start, 0, len,
890 uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
891}
892
893ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
894 unsigned long len, uffd_flags_t flags)
895{
896 return mfill_atomic(ctx, start, 0, len,
897 uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
898}
899
900long uffd_wp_range(struct vm_area_struct *dst_vma,
901 unsigned long start, unsigned long len, bool enable_wp)
902{
903 unsigned int mm_cp_flags;
904 struct mmu_gather tlb;
905 long ret;
906
907 VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
908 "The address range exceeds VMA boundary.\n");
909 if (enable_wp)
910 mm_cp_flags = MM_CP_UFFD_WP;
911 else
912 mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
913

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
920 if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
921 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
922 tlb_gather_mmu(&tlb, dst_vma->vm_mm);
923 ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
924 tlb_finish_mmu(&tlb);
925
926 return ret;
927}
928
929int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
930 unsigned long len, bool enable_wp)
931{
932 struct mm_struct *dst_mm = ctx->mm;
933 unsigned long end = start + len;
934 unsigned long _start, _end;
935 struct vm_area_struct *dst_vma;
936 unsigned long page_mask;
937 long err;
938 VMA_ITERATOR(vmi, dst_mm, start);
939
	/*
	 * Sanitize the command parameters:
	 */
943 VM_WARN_ON_ONCE(start & ~PAGE_MASK);
944 VM_WARN_ON_ONCE(len & ~PAGE_MASK);
945
	/* Does the address range wrap, or is the span zero-sized? */
947 VM_WARN_ON_ONCE(start + len <= start);
948
949 mmap_read_lock(dst_mm);
950
	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
956 down_read(&ctx->map_changing_lock);
957 err = -EAGAIN;
958 if (atomic_read(&ctx->mmap_changing))
959 goto out_unlock;
960
961 err = -ENOENT;
962 for_each_vma_range(vmi, dst_vma, end) {
963
964 if (!userfaultfd_wp(dst_vma)) {
965 err = -ENOENT;
966 break;
967 }
968
969 if (is_vm_hugetlb_page(dst_vma)) {
970 err = -EINVAL;
971 page_mask = vma_kernel_pagesize(dst_vma) - 1;
972 if ((start & page_mask) || (len & page_mask))
973 break;
974 }
975
976 _start = max(dst_vma->vm_start, start);
977 _end = min(dst_vma->vm_end, end);
978
979 err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
982 if (err < 0)
983 break;
984 err = 0;
985 }
986out_unlock:
987 up_read(&ctx->map_changing_lock);
988 mmap_read_unlock(dst_mm);
989 return err;
990}
991
992
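/*
 * Take two page-table spinlocks in a fixed (address) order so that
 * concurrent movers cannot deadlock; when src and dst share the same lock
 * it is only taken once.
 */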
993void double_pt_lock(spinlock_t *ptl1,
994 spinlock_t *ptl2)
995 __acquires(ptl1)
996 __acquires(ptl2)
997{
998 if (ptl1 > ptl2)
999 swap(ptl1, ptl2);
1000
1001 spin_lock(ptl1);
1002 if (ptl1 != ptl2)
1003 spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
1004 else
1005 __acquire(ptl2);
1006}
1007
1008void double_pt_unlock(spinlock_t *ptl1,
1009 spinlock_t *ptl2)
1010 __releases(ptl1)
1011 __releases(ptl2)
1012{
1013 spin_unlock(ptl1);
1014 if (ptl1 != ptl2)
1015 spin_unlock(ptl2);
1016 else
1017 __release(ptl2);
1018}
1019
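/*
 * Re-check, with the PTLs held, that neither PTE nor the dst PMD changed
 * since they were sampled locklessly.
 */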
1020static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
1021 pte_t orig_dst_pte, pte_t orig_src_pte,
1022 pmd_t *dst_pmd, pmd_t dst_pmdval)
1023{
1024 return pte_same(ptep_get(src_pte), orig_src_pte) &&
1025 pte_same(ptep_get(dst_pte), orig_dst_pte) &&
1026 pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
1027}
1028
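/*
 * Move a present, exclusive anonymous page: clear the source PTE, switch
 * the folio's anon rmap and index over to dst_vma, and install a writable
 * PTE at the destination. Returns -EAGAIN if the PTEs changed under us and
 * -EBUSY if the folio is large, pinned or not anon-exclusive.
 */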
1029static int move_present_pte(struct mm_struct *mm,
1030 struct vm_area_struct *dst_vma,
1031 struct vm_area_struct *src_vma,
1032 unsigned long dst_addr, unsigned long src_addr,
1033 pte_t *dst_pte, pte_t *src_pte,
1034 pte_t orig_dst_pte, pte_t orig_src_pte,
1035 pmd_t *dst_pmd, pmd_t dst_pmdval,
1036 spinlock_t *dst_ptl, spinlock_t *src_ptl,
1037 struct folio *src_folio)
1038{
1039 int err = 0;
1040
1041 double_pt_lock(dst_ptl, src_ptl);
1042
1043 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1044 dst_pmd, dst_pmdval)) {
1045 err = -EAGAIN;
1046 goto out;
1047 }
1048 if (folio_test_large(src_folio) ||
1049 folio_maybe_dma_pinned(src_folio) ||
1050 !PageAnonExclusive(&src_folio->page)) {
1051 err = -EBUSY;
1052 goto out;
1053 }
1054
1055 orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
	/* Folio got pinned from under us. Put it back and fail the move. */
1057 if (folio_maybe_dma_pinned(src_folio)) {
1058 set_pte_at(mm, src_addr, src_pte, orig_src_pte);
1059 err = -EBUSY;
1060 goto out;
1061 }
1062
1063 folio_move_anon_rmap(src_folio, dst_vma);
1064 src_folio->index = linear_page_index(dst_vma, dst_addr);
1065
1066 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
	/* Set soft dirty bit so userspace can notice the pte was moved */
1068#ifdef CONFIG_MEM_SOFT_DIRTY
1069 orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
1070#endif
1071 if (pte_dirty(orig_src_pte))
1072 orig_dst_pte = pte_mkdirty(orig_dst_pte);
1073 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
1074
1075 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
1076out:
1077 double_pt_unlock(dst_ptl, src_ptl);
1078 return err;
1079}
1080
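/*
 * Move a swap PTE from src to dst. If the entry still has a folio in the
 * swap cache, its anon rmap and index are moved to dst_vma as well; bail
 * out with -EAGAIN whenever the PTEs or the swap-cache state turn out to
 * have changed under us.
 */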
1081static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
1082 unsigned long dst_addr, unsigned long src_addr,
1083 pte_t *dst_pte, pte_t *src_pte,
1084 pte_t orig_dst_pte, pte_t orig_src_pte,
1085 pmd_t *dst_pmd, pmd_t dst_pmdval,
1086 spinlock_t *dst_ptl, spinlock_t *src_ptl,
1087 struct folio *src_folio,
1088 struct swap_info_struct *si, swp_entry_t entry)
1089{
	/*
	 * Check if the folio still belongs to the target swap entry after
	 * acquiring the lock. The folio can be freed from the swap cache
	 * while not locked.
	 */
1095 if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
1096 entry.val != src_folio->swap.val))
1097 return -EAGAIN;
1098
1099 double_pt_lock(dst_ptl, src_ptl);
1100
1101 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1102 dst_pmd, dst_pmdval)) {
1103 double_pt_unlock(dst_ptl, src_ptl);
1104 return -EAGAIN;
1105 }
1106
	/*
	 * The src_folio resides in the swapcache, requiring an update to its
	 * index and mapping to align with the dst_vma, where a swap-in may
	 * occur and hit the swapcache after moving the PTE.
	 */
1112 if (src_folio) {
1113 folio_move_anon_rmap(src_folio, dst_vma);
1114 src_folio->index = linear_page_index(dst_vma, dst_addr);
1115 } else {
		/*
		 * Check whether the swap entry gained a swap-cache folio
		 * after the PTE was sampled: look at swap_map directly
		 * (READ_ONCE is enough) for SWAP_HAS_CACHE. Holding the PTL
		 * here ensures we observe the updated value; a false
		 * positive (e.g. from a SWP_SYNCHRONOUS_IO swapin) only
		 * causes a harmless retry.
		 */
1130 if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
1131 double_pt_unlock(dst_ptl, src_ptl);
1132 return -EAGAIN;
1133 }
1134 }
1135
1136 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
1137#ifdef CONFIG_MEM_SOFT_DIRTY
1138 orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
1139#endif
1140 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
1141 double_pt_unlock(dst_ptl, src_ptl);
1142
1143 return 0;
1144}
1145
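/*
 * The source PTE maps the shared zeropage: simply clear it and install a
 * zeropage PTE at the destination.
 */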
1146static int move_zeropage_pte(struct mm_struct *mm,
1147 struct vm_area_struct *dst_vma,
1148 struct vm_area_struct *src_vma,
1149 unsigned long dst_addr, unsigned long src_addr,
1150 pte_t *dst_pte, pte_t *src_pte,
1151 pte_t orig_dst_pte, pte_t orig_src_pte,
1152 pmd_t *dst_pmd, pmd_t dst_pmdval,
1153 spinlock_t *dst_ptl, spinlock_t *src_ptl)
1154{
1155 pte_t zero_pte;
1156
1157 double_pt_lock(dst_ptl, src_ptl);
1158 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1159 dst_pmd, dst_pmdval)) {
1160 double_pt_unlock(dst_ptl, src_ptl);
1161 return -EAGAIN;
1162 }
1163
1164 zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
1165 dst_vma->vm_page_prot));
1166 ptep_clear_flush(src_vma, src_addr, src_pte);
1167 set_pte_at(mm, dst_addr, dst_pte, zero_pte);
1168 double_pt_unlock(dst_ptl, src_ptl);
1169
1170 return 0;
1171}
1172
/*
 * Move a single page mapped at src_addr (under src_pmd) to dst_addr (under
 * dst_pmd) if possible. The caller holds the VMA locks (or mmap_lock for
 * reading). Returns 0 on success, -EAGAIN when the caller should retry, or
 * another negative error code.
 */
1179static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
1180 struct vm_area_struct *dst_vma,
1181 struct vm_area_struct *src_vma,
1182 unsigned long dst_addr, unsigned long src_addr,
1183 __u64 mode)
1184{
1185 swp_entry_t entry;
1186 struct swap_info_struct *si = NULL;
1187 pte_t orig_src_pte, orig_dst_pte;
1188 pte_t src_folio_pte;
1189 spinlock_t *src_ptl, *dst_ptl;
1190 pte_t *src_pte = NULL;
1191 pte_t *dst_pte = NULL;
1192 pmd_t dummy_pmdval;
1193 pmd_t dst_pmdval;
1194 struct folio *src_folio = NULL;
1195 struct anon_vma *src_anon_vma = NULL;
1196 struct mmu_notifier_range range;
1197 int err = 0;
1198
1199 flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
1200 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1201 src_addr, src_addr + PAGE_SIZE);
1202 mmu_notifier_invalidate_range_start(&range);
1203retry:
1204
1205
1206
1207
1208
1209
1210 dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
1211 &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
1214 if (unlikely(!dst_pte)) {
1215 err = -EAGAIN;
1216 goto out;
1217 }
1218
	/*
	 * Unlike dst_pte, the stability of src_pte is verified with
	 * pte_same() later on, so there is no need for the pmd value;
	 * just pass a dummy variable to pte_offset_map_rw_nolock().
	 */
1224 src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
1225 &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
1233 if (unlikely(!src_pte)) {
1234 err = -EAGAIN;
1235 goto out;
1236 }
1237
1238
1239 if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
1240 pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
1241 err = -EINVAL;
1242 goto out;
1243 }
1244
1245 spin_lock(dst_ptl);
1246 orig_dst_pte = ptep_get(dst_pte);
1247 spin_unlock(dst_ptl);
1248 if (!pte_none(orig_dst_pte)) {
1249 err = -EEXIST;
1250 goto out;
1251 }
1252
1253 spin_lock(src_ptl);
1254 orig_src_pte = ptep_get(src_pte);
1255 spin_unlock(src_ptl);
1256 if (pte_none(orig_src_pte)) {
1257 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1258 err = -ENOENT;
1259 else
1260 err = 0;
1261 goto out;
1262 }
1263
	/* If PTE changed after we locked the folio then start over */
1265 if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1266 err = -EAGAIN;
1267 goto out;
1268 }
1269
1270 if (pte_present(orig_src_pte)) {
1271 if (is_zero_pfn(pte_pfn(orig_src_pte))) {
1272 err = move_zeropage_pte(mm, dst_vma, src_vma,
1273 dst_addr, src_addr, dst_pte, src_pte,
1274 orig_dst_pte, orig_src_pte,
1275 dst_pmd, dst_pmdval, dst_ptl, src_ptl);
1276 goto out;
1277 }

		/*
		 * Pin and lock both source folio and anon_vma. Since we are in
		 * RCU read section, we can't block, so on contention have to
		 * unmap the ptes, obtain the lock and retry.
		 */
1284 if (!src_folio) {
1285 struct folio *folio;
1286 bool locked;
1287
			/*
			 * Pin the page while holding the lock to be sure the
			 * page isn't freed under us
			 */
1292 spin_lock(src_ptl);
1293 if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
1294 spin_unlock(src_ptl);
1295 err = -EAGAIN;
1296 goto out;
1297 }
1298
1299 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1300 if (!folio || !PageAnonExclusive(&folio->page)) {
1301 spin_unlock(src_ptl);
1302 err = -EBUSY;
1303 goto out;
1304 }
1305
1306 locked = folio_trylock(folio);
1307
1308
1309
1310
1311
1312
1313
1314 if (!locked && folio_test_large(folio)) {
1315 spin_unlock(src_ptl);
1316 err = -EAGAIN;
1317 goto out;
1318 }
1319
1320 folio_get(folio);
1321 src_folio = folio;
1322 src_folio_pte = orig_src_pte;
1323 spin_unlock(src_ptl);
1324
1325 if (!locked) {
1326 pte_unmap(src_pte);
1327 pte_unmap(dst_pte);
1328 src_pte = dst_pte = NULL;
				/* now we can block and wait */
1330 folio_lock(src_folio);
1331 goto retry;
1332 }
1333
1334 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1335 err = -EBUSY;
1336 goto out;
1337 }
1338 }
1339
		/* at this point we have src_folio locked */
1341 if (folio_test_large(src_folio)) {
			/* split_folio() can block */
1343 pte_unmap(src_pte);
1344 pte_unmap(dst_pte);
1345 src_pte = dst_pte = NULL;
1346 err = split_folio(src_folio);
1347 if (err)
1348 goto out;
			/* have to reacquire the folio after it got split */
1350 folio_unlock(src_folio);
1351 folio_put(src_folio);
1352 src_folio = NULL;
1353 goto retry;
1354 }
1355
1356 if (!src_anon_vma) {
			/*
			 * folio_referenced walks the anon_vma chain
			 * without the folio lock. Serialize against it with
			 * the anon_vma lock, the folio lock is not enough.
			 */
1362 src_anon_vma = folio_get_anon_vma(src_folio);
1363 if (!src_anon_vma) {
				/* page was unmapped from under us */
1365 err = -EAGAIN;
1366 goto out;
1367 }
1368 if (!anon_vma_trylock_write(src_anon_vma)) {
1369 pte_unmap(src_pte);
1370 pte_unmap(dst_pte);
1371 src_pte = dst_pte = NULL;
				/* now we can block and wait */
1373 anon_vma_lock_write(src_anon_vma);
1374 goto retry;
1375 }
1376 }
1377
1378 err = move_present_pte(mm, dst_vma, src_vma,
1379 dst_addr, src_addr, dst_pte, src_pte,
1380 orig_dst_pte, orig_src_pte, dst_pmd,
1381 dst_pmdval, dst_ptl, src_ptl, src_folio);
1382 } else {
1383 struct folio *folio = NULL;
1384
1385 entry = pte_to_swp_entry(orig_src_pte);
1386 if (non_swap_entry(entry)) {
1387 if (is_migration_entry(entry)) {
1388 pte_unmap(src_pte);
1389 pte_unmap(dst_pte);
1390 src_pte = dst_pte = NULL;
1391 migration_entry_wait(mm, src_pmd, src_addr);
1392 err = -EAGAIN;
1393 } else
1394 err = -EFAULT;
1395 goto out;
1396 }
1397
1398 if (!pte_swp_exclusive(orig_src_pte)) {
1399 err = -EBUSY;
1400 goto out;
1401 }
1402
1403 si = get_swap_device(entry);
1404 if (unlikely(!si)) {
1405 err = -EAGAIN;
1406 goto out;
1407 }
1408
		/*
		 * Verify the existence of the swapcache. If present, the
		 * folio's index and mapping must be updated even when the
		 * PTE is a swap entry. The anon_vma lock is not taken here
		 * since the folio is already unmapped and the swap entry is
		 * exclusive, preventing rmap walks.
		 *
		 * For large folios, return -EBUSY immediately: split_folio()
		 * also returns -EBUSY when attempting to split unmapped
		 * large folios in the swap cache.
		 */
1420 if (!src_folio)
1421 folio = filemap_get_folio(swap_address_space(entry),
1422 swap_cache_index(entry));
1423 if (!IS_ERR_OR_NULL(folio)) {
1424 if (folio_test_large(folio)) {
1425 err = -EBUSY;
1426 folio_put(folio);
1427 goto out;
1428 }
1429 src_folio = folio;
1430 src_folio_pte = orig_src_pte;
1431 if (!folio_trylock(src_folio)) {
1432 pte_unmap(src_pte);
1433 pte_unmap(dst_pte);
1434 src_pte = dst_pte = NULL;
1435 put_swap_device(si);
1436 si = NULL;
				/* now we can block and wait */
1438 folio_lock(src_folio);
1439 goto retry;
1440 }
1441 }
1442 err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
1443 orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
1444 dst_ptl, src_ptl, src_folio, si, entry);
1445 }
1446
1447out:
1448 if (src_anon_vma) {
1449 anon_vma_unlock_write(src_anon_vma);
1450 put_anon_vma(src_anon_vma);
1451 }
1452 if (src_folio) {
1453 folio_unlock(src_folio);
1454 folio_put(src_folio);
1455 }
1456
1457
1458
1459
1460
1461 if (src_pte)
1462 pte_unmap(src_pte);
1463 if (dst_pte)
1464 pte_unmap(dst_pte);
1465 mmu_notifier_invalidate_range_end(&range);
1466 if (si)
1467 put_swap_device(si);
1468
1469 return err;
1470}
1471
1472#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1473static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1474 unsigned long src_addr,
1475 unsigned long src_end)
1476{
1477 return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
1478 src_end - src_addr < HPAGE_PMD_SIZE;
1479}
1480#else
1481static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1482 unsigned long src_addr,
1483 unsigned long src_end)
1484{
1485
1486 return false;
1487}
1488#endif
1489
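/*
 * Pages cannot be moved out of VMAs whose flags mark special mappings
 * (PFN/IO mappings, hugetlb, mixedmap or shadow stacks).
 */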
1490static inline bool vma_move_compatible(struct vm_area_struct *vma)
1491{
1492 return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
1493 VM_MIXEDMAP | VM_SHADOW_STACK));
1494}
1495
1496static int validate_move_areas(struct userfaultfd_ctx *ctx,
1497 struct vm_area_struct *src_vma,
1498 struct vm_area_struct *dst_vma)
1499{
	/* Only allow moving if both have the same access and protection */
1501 if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
1502 pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1503 return -EINVAL;
1504

	/* Only allow moving if both are mlocked or both aren't */
1506 if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1507 return -EINVAL;
1508

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
1513 if (!(src_vma->vm_flags & VM_WRITE))
1514 return -EINVAL;
1515
	/* Check if vma flags indicate content which can be moved */
1517 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
1518 return -EINVAL;
1519

	/* Ensure dst_vma is registered in uffd we are operating on */
1521 if (!dst_vma->vm_userfaultfd_ctx.ctx ||
1522 dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1523 return -EINVAL;
1524

	/* Only allow moving across anonymous vmas */
1526 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
1527 return -EINVAL;
1528
1529 return 0;
1530}
1531
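/*
 * With mmap_lock held, find (and anon-prepare) the destination VMA and look
 * up the source VMA, reusing the destination VMA when @src_start falls
 * inside it.
 */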
1532static __always_inline
1533int find_vmas_mm_locked(struct mm_struct *mm,
1534 unsigned long dst_start,
1535 unsigned long src_start,
1536 struct vm_area_struct **dst_vmap,
1537 struct vm_area_struct **src_vmap)
1538{
1539 struct vm_area_struct *vma;
1540
1541 mmap_assert_locked(mm);
1542 vma = find_vma_and_prepare_anon(mm, dst_start);
1543 if (IS_ERR(vma))
1544 return PTR_ERR(vma);
1545
1546 *dst_vmap = vma;
1547
1548 if (src_start >= vma->vm_start && src_start < vma->vm_end)
1549 goto out_success;
1550
1551 vma = vma_lookup(mm, src_start);
1552 if (!vma)
1553 return -ENOENT;
1554out_success:
1555 *src_vmap = vma;
1556 return 0;
1557}
1558
1559#ifdef CONFIG_PER_VMA_LOCK
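/*
 * Lock both the destination and source VMAs for a move. The dst VMA is
 * locked first via uffd_lock_vma(); if the src range lives in a different
 * VMA that cannot be locked under RCU, fall back to an mmap_read_lock()
 * section and read-lock both VMAs there.
 */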
1560static int uffd_move_lock(struct mm_struct *mm,
1561 unsigned long dst_start,
1562 unsigned long src_start,
1563 struct vm_area_struct **dst_vmap,
1564 struct vm_area_struct **src_vmap)
1565{
1566 struct vm_area_struct *vma;
1567 int err;
1568
1569 vma = uffd_lock_vma(mm, dst_start);
1570 if (IS_ERR(vma))
1571 return PTR_ERR(vma);
1572
1573 *dst_vmap = vma;
1574
1575
1576
1577
1578 if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1579 *src_vmap = vma;
1580 return 0;
1581 }
1582
	/*
	 * Using uffd_lock_vma() for src_vma could deadlock: we might block
	 * on mmap_lock while already holding the dst_vma read lock, while a
	 * writer holding mmap_lock waits to write-lock dst_vma. So only try
	 * the non-blocking RCU lookup here and fall back to the plain
	 * mmap_read_lock() section below on failure.
	 */
1595 *src_vmap = lock_vma_under_rcu(mm, src_start);
1596 if (likely(*src_vmap))
1597 return 0;
1598
1599
1600 vma_end_read(*dst_vmap);
1601
1602 mmap_read_lock(mm);
1603 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1604 if (err)
1605 goto out;
1606
1607 if (!vma_start_read_locked(*dst_vmap)) {
1608 err = -EAGAIN;
1609 goto out;
1610 }
1611
1612
1613 if (*dst_vmap == *src_vmap)
1614 goto out;
1615
1616 if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
1617
1618 vma_end_read(*dst_vmap);
1619 err = -EAGAIN;
1620 }
1621out:
1622 mmap_read_unlock(mm);
1623 return err;
1624}
1625
1626static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1627 struct vm_area_struct *src_vma)
1628{
1629 vma_end_read(src_vma);
1630 if (src_vma != dst_vma)
1631 vma_end_read(dst_vma);
1632}
1633
1634#else
1635
1636static int uffd_move_lock(struct mm_struct *mm,
1637 unsigned long dst_start,
1638 unsigned long src_start,
1639 struct vm_area_struct **dst_vmap,
1640 struct vm_area_struct **src_vmap)
1641{
1642 int err;
1643
1644 mmap_read_lock(mm);
1645 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1646 if (err)
1647 mmap_read_unlock(mm);
1648 return err;
1649}
1650
1651static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1652 struct vm_area_struct *src_vma)
1653{
1654 mmap_assert_locked(src_vma->vm_mm);
1655 mmap_read_unlock(dst_vma->vm_mm);
1656}
1657#endif
1658
/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks.
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero copy. It
 * only works on non-shared anonymous pages because those can be relocated
 * without generating non-linear anon_vmas in the rmap code. The source
 * pages should have mapcount == 1 (enforceable with MADV_DONTFORK), and the
 * mapping at the destination address is installed atomically through the
 * pagetables only, without creating any new vmas. The source and
 * destination vma protection bits must be identical.
 *
 * If the source range has unmapped holes, or the destination range is not
 * a whole unmapped hole, move_pages() fails with -ENOENT or -EEXIST
 * respectively, so userland races resolving the same fault are detected
 * rather than silently corrupting memory.
 *
 * The retval is "len" on full success. If interrupted by a fatal signal or
 * an error, it returns the number of bytes successfully moved so far if
 * any (never zero), or the negative error if nothing was moved; a short
 * return should be retried by userland with src+retval, dst+retval,
 * len-retval.
 */
1735ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
1736 unsigned long src_start, unsigned long len, __u64 mode)
1737{
1738 struct mm_struct *mm = ctx->mm;
1739 struct vm_area_struct *src_vma, *dst_vma;
1740 unsigned long src_addr, dst_addr;
1741 pmd_t *src_pmd, *dst_pmd;
1742 long err = -EINVAL;
1743 ssize_t moved = 0;
1744
	/* Sanitize the command parameters. */
1746 VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
1747 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
1748 VM_WARN_ON_ONCE(len & ~PAGE_MASK);
1749

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len <= src_start);
	VM_WARN_ON_ONCE(dst_start + len <= dst_start);
1753
1754 err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
1755 if (err)
1756 goto out;
1757
	/* Re-check mmap_changing now that map_changing_lock is held. */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (unlikely(atomic_read(&ctx->mmap_changing)))
1762 goto out_unlock;
1763
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
1768 err = -EINVAL;
1769 if (src_vma->vm_flags & VM_SHARED)
1770 goto out_unlock;
1771 if (src_start + len > src_vma->vm_end)
1772 goto out_unlock;
1773
1774 if (dst_vma->vm_flags & VM_SHARED)
1775 goto out_unlock;
1776 if (dst_start + len > dst_vma->vm_end)
1777 goto out_unlock;
1778
1779 err = validate_move_areas(ctx, src_vma, dst_vma);
1780 if (err)
1781 goto out_unlock;
1782
1783 for (src_addr = src_start, dst_addr = dst_start;
1784 src_addr < src_start + len;) {
1785 spinlock_t *ptl;
1786 pmd_t dst_pmdval;
1787 unsigned long step_size;
1788
		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
1794 src_pmd = mm_find_pmd(mm, src_addr);
1795 if (unlikely(!src_pmd)) {
1796 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1797 err = -ENOENT;
1798 break;
1799 }
1800 src_pmd = mm_alloc_pmd(mm, src_addr);
1801 if (unlikely(!src_pmd)) {
1802 err = -ENOMEM;
1803 break;
1804 }
1805 }
1806 dst_pmd = mm_alloc_pmd(mm, dst_addr);
1807 if (unlikely(!dst_pmd)) {
1808 err = -ENOMEM;
1809 break;
1810 }
1811
1812 dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
1819 if (unlikely(pmd_trans_huge(dst_pmdval))) {
1820 err = -EEXIST;
1821 break;
1822 }
1823
1824 ptl = pmd_trans_huge_lock(src_pmd, src_vma);
1825 if (ptl) {
1826
1827 if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
1828 !pmd_none(dst_pmdval)) {
1829
1830 if (pmd_present(*src_pmd)) {
1831 struct folio *folio = pmd_folio(*src_pmd);
1832
1833 if (!is_huge_zero_folio(folio) &&
1834 !PageAnonExclusive(&folio->page)) {
1835 spin_unlock(ptl);
1836 err = -EBUSY;
1837 break;
1838 }
1839 }
1840
1841 spin_unlock(ptl);
1842 split_huge_pmd(src_vma, src_pmd, src_addr);
1843
1844 continue;
1845 }
1846
1847 err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1848 dst_pmdval, dst_vma, src_vma,
1849 dst_addr, src_addr);
1850 step_size = HPAGE_PMD_SIZE;
1851 } else {
1852 if (pmd_none(*src_pmd)) {
1853 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1854 err = -ENOENT;
1855 break;
1856 }
1857 if (unlikely(__pte_alloc(mm, src_pmd))) {
1858 err = -ENOMEM;
1859 break;
1860 }
1861 }
1862
1863 if (unlikely(pte_alloc(mm, dst_pmd))) {
1864 err = -ENOMEM;
1865 break;
1866 }
1867
1868 err = move_pages_pte(mm, dst_pmd, src_pmd,
1869 dst_vma, src_vma,
1870 dst_addr, src_addr, mode);
1871 step_size = PAGE_SIZE;
1872 }
1873
1874 cond_resched();
1875
1876 if (fatal_signal_pending(current)) {
			/* Do not override an error */
1878 if (!err || err == -EAGAIN)
1879 err = -EINTR;
1880 break;
1881 }
1882
1883 if (err) {
1884 if (err == -EAGAIN)
1885 continue;
1886 break;
1887 }
1888
1889
1890 dst_addr += step_size;
1891 src_addr += step_size;
1892 moved += step_size;
1893 }
1894
1895out_unlock:
1896 up_read(&ctx->map_changing_lock);
1897 uffd_move_unlock(dst_vma, src_vma);
1898out:
1899 VM_WARN_ON_ONCE(moved < 0);
1900 VM_WARN_ON_ONCE(err > 0);
1901 VM_WARN_ON_ONCE(!moved && !err);
1902 return moved ? moved : err;
1903}
1904
1905static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
1906 vm_flags_t vm_flags)
1907{
1908 const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
1909
1910 vm_flags_reset(vma, vm_flags);
1911
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll
	 * simply recalculate vma->vm_page_prot whenever userfaultfd-wp
	 * changes.
	 */
1916 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
1917 vma_set_page_prot(vma);
1918}
1919
1920static void userfaultfd_set_ctx(struct vm_area_struct *vma,
1921 struct userfaultfd_ctx *ctx,
1922 vm_flags_t vm_flags)
1923{
1924 vma_start_write(vma);
1925 vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
1926 userfaultfd_set_vm_flags(vma,
1927 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
1928}
1929
1930void userfaultfd_reset_ctx(struct vm_area_struct *vma)
1931{
1932 userfaultfd_set_ctx(vma, NULL, 0);
1933}
1934
1935struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
1936 struct vm_area_struct *prev,
1937 struct vm_area_struct *vma,
1938 unsigned long start,
1939 unsigned long end)
1940{
1941 struct vm_area_struct *ret;
1942 bool give_up_on_oom = false;
1943
1944
1945
1946
1947
1948 if (start == vma->vm_start && end == vma->vm_end)
1949 give_up_on_oom = true;
1950
	/* Reset ptes for the whole vma range if wr-protected */
1952 if (userfaultfd_wp(vma))
1953 uffd_wp_range(vma, start, end - start, false);
1954
1955 ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
1956 vma->vm_flags & ~__VM_UFFD_FLAGS,
1957 NULL_VM_UFFD_CTX, give_up_on_oom);
1958
1959
1960
1961
1962
1963
1964 if (!IS_ERR(ret))
1965 userfaultfd_reset_ctx(ret);
1966
1967 return ret;
1968}
1969
1970
1971int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
1972 struct vm_area_struct *vma,
1973 vm_flags_t vm_flags,
1974 unsigned long start, unsigned long end,
1975 bool wp_async)
1976{
1977 VMA_ITERATOR(vmi, ctx->mm, start);
1978 struct vm_area_struct *prev = vma_prev(&vmi);
1979 unsigned long vma_end;
1980 vm_flags_t new_flags;
1981
1982 if (vma->vm_start < start)
1983 prev = vma;
1984
1985 for_each_vma_range(vmi, vma, end) {
1986 cond_resched();
1987
1988 VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
1989 VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
1990 vma->vm_userfaultfd_ctx.ctx != ctx);
1991 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
1992
		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
1997 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1998 (vma->vm_flags & vm_flags) == vm_flags)
1999 goto skip;
2000
2001 if (vma->vm_start > start)
2002 start = vma->vm_start;
2003 vma_end = min(end, vma->vm_end);
2004
2005 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
2006 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
2007 new_flags,
2008 (struct vm_userfaultfd_ctx){ctx},
2009 false);
2010 if (IS_ERR(vma))
2011 return PTR_ERR(vma);
2012
2013
2014
2015
2016
2017
2018 userfaultfd_set_ctx(vma, ctx, vm_flags);
2019
2020 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
2021 hugetlb_unshare_all_pmds(vma);
2022
2023skip:
2024 prev = vma;
2025 start = vma->vm_end;
2026 }
2027
2028 return 0;
2029}
2030
2031void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
2032{
2033 struct mm_struct *mm = ctx->mm;
2034 struct vm_area_struct *vma;
2035 VMA_ITERATOR(vmi, mm, 0);
2036
2037
2038 mmap_write_lock(mm);
2039 for_each_vma(vmi, vma) {
2040 if (vma->vm_userfaultfd_ctx.ctx == ctx)
2041 userfaultfd_reset_ctx(vma);
2042 }
2043 mmap_write_unlock(mm);
2044}
2045
2046void userfaultfd_release_all(struct mm_struct *mm,
2047 struct userfaultfd_ctx *ctx)
2048{
2049 struct vm_area_struct *vma, *prev;
2050 VMA_ITERATOR(vmi, mm, 0);
2051
2052 if (!mmget_not_zero(mm))
2053 return;
2054
	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
2063 mmap_write_lock(mm);
2064 prev = NULL;
2065 for_each_vma(vmi, vma) {
2066 cond_resched();
2067 VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
2068 !!(vma->vm_flags & __VM_UFFD_FLAGS));
2069 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
2070 prev = vma;
2071 continue;
2072 }
2073
2074 vma = userfaultfd_clear_vma(&vmi, prev, vma,
2075 vma->vm_start, vma->vm_end);
2076 prev = vma;
2077 }
2078 mmap_write_unlock(mm);
2079 mmput(mm);
2080}
2081