// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  userfaultfd: resolve page faults in userland, via the userfaultfd()
 *  syscall and the UFFDIO_* ioctls defined in <linux/userfaultfd.h>.
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>

static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;

enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

/*
 * Per-userfaultfd context, shared by all vmas registered against the same
 * fd.  Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults read and waiting for a wakeup */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events (fork/remap/remove/unmap) */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	struct seqcount refile_seq;
	/* pseudo fd refcounting */
	atomic_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	bool mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	bool waken;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

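/*
 * A rough sketch of the life of a userfault, for orientation (see
 * Documentation/admin-guide/mm/userfaultfd.rst for the full description):
 *
 *  1. A task faults in a registered vma; handle_userfault() queues a
 *     struct userfaultfd_wait_queue on fault_pending_wqh and sleeps.
 *  2. The monitor's poll()/read() on the uffd finds it there, copies out
 *     the uffd_msg and refiles the waiter to fault_wqh.
 *  3. UFFDIO_COPY/UFFDIO_ZEROPAGE (or an explicit UFFDIO_WAKE) resolves the
 *     fault and wakes the range, removing the waiter so the faulting task
 *     can retry the access.
 */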
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After list_del_init() is visible to the other CPUs the
		 * waitqueue may disappear from under us, see the
		 * !list_empty_careful() check in handle_userfault().
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}

/*
 * userfaultfd_ctx_get - acquire a reference on @ctx; the refcount must
 * already be non-zero.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	if (!atomic_inc_not_zero(&ctx->refcount))
		BUG();
}

/*
 * userfaultfd_ctx_put - drop a reference on @ctx and free it once the last
 * reference is gone.  No waiter may still be queued on any of the
 * waitqueues at that point.
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}

static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the paddings or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}

static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;
	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;
	msg.arg.pagefault.address = address;
	if (flags & FAULT_FLAG_WRITE)
		/* flag a write fault (as opposed to a read fault) */
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		/* flag a write-protect fault (as opposed to a missing fault) */
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	mmap_assert_locked(mm);

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (huge_pte_none(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else /* !CONFIG_HUGETLB_PAGE */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	return false;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh, to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved by a concurrent
 * UFFDIO_COPY/UFFDIO_ZEROPAGE running on another thread.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	bool ret = true;

	mmap_assert_locked(mm);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
	/*
	 * READ_ONCE must function as a barrier with narrower scope,
	 * equivalent to "_pmd = *pmd; barrier();", to deal with the
	 * instability (as in pmd_trans_unstable) of the pmd.
	 */
	_pmd = READ_ONCE(*pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	/*
	 * The pmd is stable (as in !pmd_trans_unstable) so we can re-read it
	 * with the standard pte_offset_map() instead of parsing _pmd.
	 */
	pte = pte_offset_map(pmd, address);
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (pte_none(*pte))
		ret = true;
	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}

/* Choose how the faulting task should sleep, based on the fault flags. */
static inline long userfaultfd_get_blocking_state(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return TASK_INTERRUPTIBLE;

	if (flags & FAULT_FLAG_KILLABLE)
		return TASK_KILLABLE;

	return TASK_UNINTERRUPTIBLE;
}

/* Matching check: has a signal arrived that should interrupt the sleep? */
static inline bool userfaultfd_signal_pending(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return signal_pending(current);

	if (flags & FAULT_FLAG_KILLABLE)
		return fatal_signal_pending(current);

	return false;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and FAULT_FLAG_KILLABLE
 * are not straightforward: if FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock
 * must be released before returning VM_FAULT_RETRY, unless
 * FAULT_FLAG_RETRY_NOWAIT is also set.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct mm_struct *mm = vmf->vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	long blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update
	 * or when coredumping: faults triggered by get_dump_page() will
	 * return VM_FAULT_SIGBUS and the dump simply skips the page.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	/*
	 * Coredumping runs without mmap_lock, so we can only check that
	 * the mmap_lock is held if PF_DUMPCORE was not set.
	 */
	mmap_assert_locked(mm);

	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS here: the uffd may have
		 * been closed by a non-cooperative manager right after
		 * the last UFFDIO_COPY.  Returning VM_FAULT_NOPAGE
		 * retries the fault, which by then will find either the
		 * copied page or a disarmed vma.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.  For VM_UFFD_MISSING
	 * userfaults requiring FAULT_FLAG_ALLOW_RETRY is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
			ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	spin_lock(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vmf->vma))
		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
						  reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
						       vmf->address,
						       vmf->flags, reason);
	mmap_read_unlock(mm);

	if (likely(must_wait && !READ_ONCE(ctx->released) &&
		   !userfaultfd_signal_pending(vmf->flags))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
		ret |= VM_FAULT_MAJOR;

		/*
		 * False wakeups can originate even from rwsem before
		 * up_read(): keep waiting for a targeted wakeup on the
		 * uwq waitqueue from wake_userfault(), a signal or the
		 * uffd release.
		 */
		while (!READ_ONCE(uwq.waken)) {
			/*
			 * This needs the full smp_store_mb() guarantee
			 * so the state write is visible to other CPUs
			 * before uwq.waken is re-read.
			 */
			set_current_state(blocking_state);
			if (READ_ONCE(uwq.waken) ||
			    READ_ONCE(ctx->released) ||
			    userfaultfd_signal_pending(vmf->flags))
				break;
			schedule();
		}
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self, so
	 * list_empty_careful() is safe here without the spinlock.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}
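
/*
 * For reference, the userspace side that pairs with handle_userfault()
 * typically looks roughly like the sketch below (illustrative only, not
 * part of this file; error handling omitted; "page" and "page_size" stand
 * for a buffer and page size owned by the monitor):
 *
 *	struct uffd_msg msg;
 *	struct uffdio_copy copy;
 *
 *	read(uffd, &msg, sizeof(msg));		// blocks in userfaultfd_ctx_read()
 *	if (msg.event == UFFD_EVENT_PAGEFAULT) {
 *		copy.dst = msg.arg.pagefault.address & ~(page_size - 1);
 *		copy.src = (unsigned long)page;
 *		copy.len = page_size;
 *		copy.mode = 0;
 *		ioctl(uffd, UFFDIO_COPY, &copy);	// wakes the faulting task
 *	}
 */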

static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter, so this is fine either way.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;

		/* the various vma->vm_userfaultfd_ctx still points to it */
		mmap_write_lock(mm);
		/* no task can run (and in turn coredump) yet */
		VM_WARN_ON(!mmget_still_valid(mm));
		for (vma = mm->mmap; vma; vma = vma->vm_next)
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
			}
		mmap_write_unlock(mm);

		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	WRITE_ONCE(ctx->mmap_changing, false);
	userfaultfd_ctx_put(ctx);
}

static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				       struct userfaultfd_wait_queue *ewq)
{
	ewq->msg.event = 0;
	wake_up_locked(&ctx->event_wqh);
	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
	struct userfaultfd_ctx *ctx = NULL, *octx;
	struct userfaultfd_fork_ctx *fctx;

	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
		return 0;
	}

	list_for_each_entry(fctx, fcs, list)
		if (fctx->orig == octx) {
			ctx = fctx->new;
			break;
		}

	if (!ctx) {
		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
		if (!fctx)
			return -ENOMEM;

		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
		if (!ctx) {
			kfree(fctx);
			return -ENOMEM;
		}

		atomic_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->state = UFFD_STATE_RUNNING;
		ctx->features = octx->features;
		ctx->released = false;
		ctx->mmap_changing = false;
		ctx->mm = vma->vm_mm;
		mmgrab(ctx->mm);

		userfaultfd_ctx_get(octx);
		WRITE_ONCE(octx->mmap_changing, true);
		fctx->orig = octx;
		fctx->new = ctx;
		list_add_tail(&fctx->list, fcs);
	}

	vma->vm_userfaultfd_ctx.ctx = ctx;
	return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
	struct userfaultfd_ctx *ctx = fctx->orig;
	struct userfaultfd_wait_queue ewq;

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_FORK;
	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *fctx, *n;

	list_for_each_entry_safe(fctx, n, fcs, list) {
		dup_fctx(fctx);
		list_del(&fctx->list);
		kfree(fctx);
	}
}

void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *ctx;

	ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return;

	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
		vm_ctx->ctx = ctx;
		userfaultfd_ctx_get(ctx);
		WRITE_ONCE(ctx->mmap_changing, true);
	} else {
		/* Drop uffd context if remap feature not enabled */
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
	}
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				 unsigned long from, unsigned long to,
				 unsigned long len)
{
	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
	struct userfaultfd_wait_queue ewq;

	if (!ctx)
		return;

	if (to & ~PAGE_MASK) {
		userfaultfd_ctx_put(ctx);
		return;
	}

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMAP;
	ewq.msg.arg.remap.from = from;
	ewq.msg.arg.remap.to = to;
	ewq.msg.arg.remap.len = len;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue ewq;

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
		return true;

	userfaultfd_ctx_get(ctx);
	WRITE_ONCE(ctx->mmap_changing, true);
	mmap_read_unlock(mm);

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMOVE;
	ewq.msg.arg.remove.start = start;
	ewq.msg.arg.remove.end = end;

	userfaultfd_event_wait_completion(ctx, &ewq);

	return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
			  unsigned long start, unsigned long end)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;

	list_for_each_entry(unmap_ctx, unmaps, list)
		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
		    unmap_ctx->end == end)
			return true;

	return false;
}

int userfaultfd_unmap_prep(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end,
			   struct list_head *unmaps)
{
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
		struct userfaultfd_unmap_ctx *unmap_ctx;
		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
		    has_unmap_ctx(ctx, unmaps, start, end))
			continue;

		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
		if (!unmap_ctx)
			return -ENOMEM;

		userfaultfd_ctx_get(ctx);
		WRITE_ONCE(ctx->mmap_changing, true);
		unmap_ctx->ctx = ctx;
		unmap_ctx->start = start;
		unmap_ctx->end = end;
		list_add_tail(&unmap_ctx->list, unmaps);
	}

	return 0;
}

void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
	struct userfaultfd_unmap_ctx *ctx, *n;
	struct userfaultfd_wait_queue ewq;

	list_for_each_entry_safe(ctx, n, uf, list) {
		msg_init(&ewq.msg);

		ewq.msg.event = UFFD_EVENT_UNMAP;
		ewq.msg.arg.remove.start = ctx->start;
		ewq.msg.arg.remove.end = ctx->end;

		userfaultfd_event_wait_completion(ctx->ctx, &ewq);

		list_del(&ctx->list);
		kfree(ctx);
	}
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;
	bool still_valid;

	WRITE_ONCE(ctx->released, true);

	if (!mmget_not_zero(mm))
		goto wakeup;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above) before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	still_valid = mmget_still_valid(mm);
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		if (still_valid) {
			prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
					 new_flags, vma->anon_vma,
					 vma->vm_file, vma->vm_pgoff,
					 vma_policy(vma),
					 NULL_VM_UFFD_CTX);
			if (prev)
				vma = prev;
			else
				prev = vma;
		}
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	mmap_write_unlock(mm);
	mmput(mm);
wakeup:
	/*
	 * After no new page faults can wait on this fault_*wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_*wqh.
	 */
	spin_lock(&ctx->fault_pending_wqh.lock);
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
	spin_unlock(&ctx->fault_pending_wqh.lock);

	/* Flush pending events that may still wait on event_wqh */
	wake_up_all(&ctx->event_wqh);

	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_pending_wqh.lock must be held by the caller */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;
	struct userfaultfd_wait_queue *uwq;

	lockdep_assert_held(&wqh->lock);

	uwq = NULL;
	if (!waitqueue_active(wqh))
		goto out;
	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
	return uwq;
}

static inline struct userfaultfd_wait_queue *find_userfault(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->event_wqh);
}

static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	__poll_t ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
		return EPOLLERR;
	case UFFD_STATE_RUNNING:
		/*
		 * poll() never guarantees that read won't block.
		 * userfaults can be waken before they're read().
		 */
		if (unlikely(!(file->f_flags & O_NONBLOCK)))
			return EPOLLERR;
		/*
		 * Lockless access to see if there are pending faults.
		 * The spin_unlock in __add_wait_queue callers would
		 * allow waitqueue_active() to be reordered before the
		 * actual list_add, so use a full memory barrier to
		 * serialize the list_add write of __add_wait_queue
		 * vs the waitqueue_active read below.
		 */
		ret = 0;
		smp_mb();
		if (waitqueue_active(&ctx->fault_pending_wqh))
			ret = EPOLLIN;
		else if (waitqueue_active(&ctx->event_wqh))
			ret = EPOLLIN;

		return ret;
	default:
		WARN_ON_ONCE(1);
		return EPOLLERR;
	}
}

static const struct file_operations userfaultfd_fops;

static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
				  struct userfaultfd_ctx *new,
				  struct uffd_msg *msg)
{
	int fd;

	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
			      O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
	if (fd < 0)
		return fd;

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = fd;
	return 0;
}

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so we drop
	 * the event_wqh lock, do those ops, then lock it back and wake
	 * up the waiter. While the lock is dropped the ewq may go away
	 * so we keep track of it carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() and avoid missing
			 * wakeups: during the refile both waitqueues
			 * could become empty if this is the only
			 * userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * Refile this userfault from fault_pending_wqh
			 * to fault_wqh: it's not pending anymore after
			 * we read it.  Use list_del() by hand (as
			 * userfaultfd_wake_function also uses
			 * list_del_init() by hand) to be sure nobody
			 * can mess with the uwq.wq.entry anymore.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
		spin_lock(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * The event wasn't released by fork (it lives
			 * on the fork kernel stack), so put it back in
			 * its place in event_wqh: the local fork_event
			 * head is freed as soon as we return and the
			 * event cannot stay queued there no matter the
			 * current "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * the error to userland if we failed to resolve
			 * the fork event. Otherwise, report the event.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * The fork thread aborted and already dropped
			 * its reference on fork_nctx; the reference we
			 * took above is either handed over to the new
			 * uffd file (ret == 0) or dropped here on
			 * error, so fork_nctx can be freed.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock(&ctx->event_wqh.lock);
	}

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	for (;;) {
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow to read more than one fault at time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
	spin_unlock(&ctx->fault_pending_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or mmap_read_unlock(mm) still
	 * have release semantics that could allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we've userfaults to wake.
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 *start, __u64 len)
{
	__u64 task_size = mm->task_size;

	*start = untagged_addr(*start);

	if (*start & ~PAGE_MASK)
		return -EINVAL;
	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (*start < mmap_min_addr)
		return -EINVAL;
	if (*start >= task_size)
		return -EINVAL;
	if (len > task_size - *start)
		return -EINVAL;
	return 0;
}

static inline bool vma_can_userfault(struct vm_area_struct *vma,
				     unsigned long vm_flags)
{
	/* FIXME: add WP support to hugetlbfs and shmem */
	return vma_is_anonymous(vma) ||
		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
		 !(vm_flags & VM_UFFD_WP));
}

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
				     UFFDIO_REGISTER_MODE_WP))
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
		vm_flags |= VM_UFFD_WP;

	ret = validate_range(mm, &uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	if (!mmget_still_valid(mm))
		goto out_unlock;
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	basic_ioctls = false;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/* check not compatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED vma, the process has write permission to
		 * the backing file (VM_MAYWRITE also implies no
		 * F_SEAL_WRITE can be taken until the vma is destroyed).
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains the ending address, check the
		 * huge page alignment too.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd: more than one userfaultfd
		 * owning a single vma would leave it ambiguous which
		 * one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }));
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx.ctx = ctx;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode was
		 * specified and all checks passed with the range.
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, &uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	if (!mmget_still_valid(mm))
		goto out_unlock;
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	ret = -EINVAL;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/*
		 * Check not compatible vmas, not strictly required
		 * here as not compatible vmas cannot have a
		 * userfaultfd_ctx registered on them, but this forbids
		 * userland from calling UFFDIO_UNREGISTER on a range
		 * that falls in a non-uffd-compatible vma.
		 */
		if (!vma_can_userfault(cur, cur->vm_flags))
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));

		/*
		 * Nothing to do: this vma is not registered with
		 * userfaultfd.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and userland doesn't have to call
			 * UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX);
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
out:
	return ret;
}

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}

static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;

	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
	if (ret)
		goto out;
	/*
	 * double check for wraparound just in case. copy_from_user()
	 * will later check uffdio_copy.src + uffdio_copy.len to fit
	 * in the userland range.
	 */
	ret = -EINVAL;
	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
		goto out;
	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
		goto out;
	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
				   uffdio_copy.len, &ctx->mmap_changing,
				   uffdio_copy.mode);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}

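/*
 * UFFDIO_COPY return contract, as implemented above, for quick reference:
 * uffdio_copy.copy is set to the number of bytes copied (or a negative
 * error), while the ioctl itself returns 0 only if the whole range was
 * filled and -EAGAIN on a partial copy or while mmap_changing is set, so
 * the caller is expected to re-issue the ioctl for the remainder.
 */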
static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
				     uffdio_zeropage.range.len,
				     &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
}

static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
				    unsigned long arg)
{
	int ret;
	struct uffdio_writeprotect uffdio_wp;
	struct uffdio_writeprotect __user *user_uffdio_wp;
	struct userfaultfd_wake_range range;
	bool mode_wp, mode_dontwake;

	if (READ_ONCE(ctx->mmap_changing))
		return -EAGAIN;

	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
			   sizeof(struct uffdio_writeprotect)))
		return -EFAULT;

	ret = validate_range(ctx->mm, &uffdio_wp.range.start,
			     uffdio_wp.range.len);
	if (ret)
		return ret;

	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
			       UFFDIO_WRITEPROTECT_MODE_WP))
		return -EINVAL;

	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

	if (mode_wp && mode_dontwake)
		return -EINVAL;

	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
				  uffdio_wp.range.len, mode_wp,
				  &ctx->mmap_changing);
	if (ret)
		return ret;

	if (!mode_wp && !mode_dontwake) {
		range.start = uffdio_wp.range.start;
		range.len = uffdio_wp.range.len;
		wake_userfault(ctx, &range);
	}
	return ret;
}

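/*
 * Typical write-protect usage with the above: register the range with
 * UFFDIO_REGISTER_MODE_WP, write-protect it via UFFDIO_WRITEPROTECT with
 * UFFDIO_WRITEPROTECT_MODE_WP set, and later resolve UFFD_PAGEFAULT_FLAG_WP
 * faults by calling UFFDIO_WRITEPROTECT again with mode_wp cleared, which
 * also wakes the blocked writer unless _MODE_DONTWAKE is passed.
 */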
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide.
	 */
	return (unsigned int)user_features;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	int ret;
	__u64 features;

	ret = -EINVAL;
	if (ctx->state != UFFD_STATE_WAIT_API)
		goto out;
	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
		memset(&uffdio_api, 0, sizeof(uffdio_api));
		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
			goto out;
		ret = -EINVAL;
		goto out;
	}
	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;
	ctx->state = UFFD_STATE_RUNNING;
	/* only enable the requested features for this uffd context */
	ctx->features = uffd_ctx_features(features);
	ret = 0;
out:
	return ret;
}

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols will be added, they will all be shown
	 * separated by a space, like: protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_init(&ctx->refile_seq);
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency.  */
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	atomic_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
	ctx->mmap_changing = false;
	ctx->mm = current->mm;
	/* prevent the mm struct to be freed */
	mmgrab(ctx->mm);

	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
			      O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}
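
/*
 * Minimal userspace bring-up sequence against this syscall and the ioctls
 * above (illustrative sketch only, error handling omitted; "area" and
 * "area_len" stand for a page-aligned mapping owned by the caller):
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *
 *	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
 *	ioctl(uffd, UFFDIO_API, &api);		// UFFD_STATE_WAIT_API -> RUNNING
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)area, .len = area_len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);	// arms the vma(s) covering "area"
 */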

static int __init userfaultfd_init(void)
{
	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
	return 0;
}
__initcall(userfaultfd_init);