/*
 * fs/userfaultfd.c
 *
 * userfaultfd() system call: handle page faults in userland.
 */
#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>

static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;

enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

/*
 * Per-userfaultfd context, shared by all VMAs registered on the same
 * uffd and by the file descriptor returned by the userfaultfd() syscall.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults read by userland */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	struct seqcount refile_seq;
	/* pseudo fd refcounting */
	atomic_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	bool mmap_changing;
	/* mm with one ore more vmas attached to this userfaultfd context */
	struct mm_struct *mm;
};

struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	bool waken;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the
		 * other CPUs, the waitqueue may disappear from under
		 * us, see the !list_empty_careful() in
		 * handle_userfault().
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	if (!atomic_inc_not_zero(&ctx->refcount))
		BUG();
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired
 * with userfaultfd_ctx_get().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}

static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the paddings or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}

static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;
	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;
	msg.arg.pagefault.address = address;
	if (flags & FAULT_FLAG_WRITE)
		/* flag a write fault, as opposed to a read fault */
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		/* flag a write-protect fault, as opposed to a missing fault */
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications
 * for hugepages.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (huge_pte_none(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	return false;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid waiting forever if there's none of
 * them.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	bool ret = true;

	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
	/*
	 * READ_ONCE must function as a barrier with narrower scope
	 * and it must be equivalent to:
	 *	_pmd = *pmd; barrier();
	 *
	 * This is to deal with the instability (as in
	 * pmd_trans_unstable) of the pmd.
	 */
	_pmd = READ_ONCE(*pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd))
		goto out;

	/*
	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
	 * and use the standard pte_offset_map() instead of parsing _pmd
	 */
	pte = pte_offset_map(pmd, address);
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (pte_none(*pte))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_sem must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct mm_struct *mm = vmf->vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait, return_to_userland;
	long blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update
	 * and we don't do it during coredumping either (the shmem fault
	 * method can be invoked during coredumping without mmap_sem and
	 * it ends up here).
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	/*
	 * Coredumping runs without mmap_sem so we can only check that
	 * the mmap_sem is held, if PF_DUMPCORE was not set.
	 */
	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));

	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;

	/*
	 * If it's already released don't get it. This avoids to loop
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_sem.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a non
		 * cooperative manager can close the uffd after the
		 * last UFFDIO_COPY without risking to trigger an
		 * involuntary SIGBUS in the faulting process.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_sem */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
				ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	return_to_userland =
		(vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
		(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
			 TASK_KILLABLE;

	spin_lock(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vmf->vma))
		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
						  reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
						       vmf->address,
						       vmf->flags, reason);
	up_read(&mm->mmap_sem);

	if (likely(must_wait && !READ_ONCE(ctx->released) &&
		   (return_to_userland ? !signal_pending(current) :
		    !fatal_signal_pending(current)))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
		ret |= VM_FAULT_MAJOR;

		/*
		 * False wakeups can originate even from rwsem before
		 * up_read() however userfaults will wait either for a
		 * targeted wakeup on the specific uwq waitqueue from
		 * wake_userfault() or for signals or for uffd
		 * release.
		 */
		while (!READ_ONCE(uwq.waken)) {
			/*
			 * This needs the full smp_store_mb()
			 * guarantee as the state write must be
			 * visible to other CPUs before reading
			 * uwq.waken from other CPUs.
			 */
			set_current_state(blocking_state);
			if (READ_ONCE(uwq.waken) ||
			    READ_ONCE(ctx->released) ||
			    (return_to_userland ? signal_pending(current) :
			     fatal_signal_pending(current)))
				break;
			schedule();
		}
	}

	__set_current_state(TASK_RUNNING);

	if (return_to_userland) {
		if (signal_pending(current) &&
		    !fatal_signal_pending(current)) {
			/*
			 * If we got a SIGSTOP or SIGCONT and this is a
			 * normal userland page fault, just let userland
			 * return so the signal will be handled and gdb
			 * debugging works. We return VM_FAULT_NOPAGE
			 * instead of VM_FAULT_RETRY, so the mmap_sem
			 * (released above) must be retaken.
			 */
			down_read(&mm->mmap_sem);
			ret = VM_FAULT_NOPAGE;
		}
	}

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the
	 * "uwq.wq.entry" can't move from one list to the other (and
	 * list_empty_careful won't risk to see both pointers pointing
	 * to self at any time during the list refile).
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}

static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the event is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter so removing it here is fine.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;

		/* the various vma->vm_userfaultfd_ctx still points to it */
		down_write(&mm->mmap_sem);
		/* no task can run (and in turn coredump) yet */
		VM_WARN_ON(!mmget_still_valid(mm));
		for (vma = mm->mmap; vma; vma = vma->vm_next)
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
			}
		up_write(&mm->mmap_sem);

		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	WRITE_ONCE(ctx->mmap_changing, false);
	userfaultfd_ctx_put(ctx);
}

static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				       struct userfaultfd_wait_queue *ewq)
{
	ewq->msg.event = 0;
	wake_up_locked(&ctx->event_wqh);
	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
	struct userfaultfd_ctx *ctx = NULL, *octx;
	struct userfaultfd_fork_ctx *fctx;

	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
		return 0;
	}

	list_for_each_entry(fctx, fcs, list)
		if (fctx->orig == octx) {
			ctx = fctx->new;
			break;
		}

	if (!ctx) {
		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
		if (!fctx)
			return -ENOMEM;

		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
		if (!ctx) {
			kfree(fctx);
			return -ENOMEM;
		}

		atomic_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->state = UFFD_STATE_RUNNING;
		ctx->features = octx->features;
		ctx->released = false;
		ctx->mmap_changing = false;
		ctx->mm = vma->vm_mm;
		mmgrab(ctx->mm);

		userfaultfd_ctx_get(octx);
		WRITE_ONCE(octx->mmap_changing, true);
		fctx->orig = octx;
		fctx->new = ctx;
		list_add_tail(&fctx->list, fcs);
	}

	vma->vm_userfaultfd_ctx.ctx = ctx;
	return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
	struct userfaultfd_ctx *ctx = fctx->orig;
	struct userfaultfd_wait_queue ewq;

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_FORK;
	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *fctx, *n;

	list_for_each_entry_safe(fctx, n, fcs, list) {
		dup_fctx(fctx);
		list_del(&fctx->list);
		kfree(fctx);
	}
}

void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *ctx;

	ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return;

	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
		vm_ctx->ctx = ctx;
		userfaultfd_ctx_get(ctx);
		WRITE_ONCE(ctx->mmap_changing, true);
	} else {
		/* Drop uffd context if remap feature not enabled */
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
	}
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				 unsigned long from, unsigned long to,
				 unsigned long len)
{
	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
	struct userfaultfd_wait_queue ewq;

	if (!ctx)
		return;

	if (to & ~PAGE_MASK) {
		userfaultfd_ctx_put(ctx);
		return;
	}

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMAP;
	ewq.msg.arg.remap.from = from;
	ewq.msg.arg.remap.to = to;
	ewq.msg.arg.remap.len = len;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue ewq;

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
		return true;

	userfaultfd_ctx_get(ctx);
	WRITE_ONCE(ctx->mmap_changing, true);
	up_read(&mm->mmap_sem);

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMOVE;
	ewq.msg.arg.remove.start = start;
	ewq.msg.arg.remove.end = end;

	userfaultfd_event_wait_completion(ctx, &ewq);

	return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
			  unsigned long start, unsigned long end)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;

	list_for_each_entry(unmap_ctx, unmaps, list)
		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
		    unmap_ctx->end == end)
			return true;

	return false;
}

int userfaultfd_unmap_prep(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end,
			   struct list_head *unmaps)
{
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
		struct userfaultfd_unmap_ctx *unmap_ctx;
		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
		    has_unmap_ctx(ctx, unmaps, start, end))
			continue;

		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
		if (!unmap_ctx)
			return -ENOMEM;

		userfaultfd_ctx_get(ctx);
		WRITE_ONCE(ctx->mmap_changing, true);
		unmap_ctx->ctx = ctx;
		unmap_ctx->start = start;
		unmap_ctx->end = end;
		list_add_tail(&unmap_ctx->list, unmaps);
	}

	return 0;
}

void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
	struct userfaultfd_unmap_ctx *ctx, *n;
	struct userfaultfd_wait_queue ewq;

	list_for_each_entry_safe(ctx, n, uf, list) {
		msg_init(&ewq.msg);

		ewq.msg.event = UFFD_EVENT_UNMAP;
		ewq.msg.arg.remove.start = ctx->start;
		ewq.msg.arg.remove.end = ctx->end;

		userfaultfd_event_wait_completion(ctx->ctx, &ewq);

		list_del(&ctx->list);
		kfree(ctx);
	}
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;
	bool still_valid;

	WRITE_ONCE(ctx->released, true);

	if (!mmget_not_zero(mm))
		goto wakeup;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_sem. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_sem for writing.
	 */
	down_write(&mm->mmap_sem);
	still_valid = mmget_still_valid(mm);
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		if (still_valid) {
			prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
					 new_flags, vma->anon_vma,
					 vma->vm_file, vma->vm_pgoff,
					 vma_policy(vma),
					 NULL_VM_UFFD_CTX);
			if (prev)
				vma = prev;
			else
				prev = vma;
		}
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	up_write(&mm->mmap_sem);
	mmput(mm);
wakeup:
	/*
	 * After no new page faults can wait on this fault_*wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_*wqh.
	 */
	spin_lock(&ctx->fault_pending_wqh.lock);
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
	spin_unlock(&ctx->fault_pending_wqh.lock);

	/* Flush pending events that may still wait on event_wqh */
	wake_up_all(&ctx->event_wqh);

	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_pending_wqh.lock must be hold by the caller */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;
	struct userfaultfd_wait_queue *uwq;

	lockdep_assert_held(&wqh->lock);

	uwq = NULL;
	if (!waitqueue_active(wqh))
		goto out;
	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
	return uwq;
}

static inline struct userfaultfd_wait_queue *find_userfault(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->event_wqh);
}

static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	__poll_t ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
		return EPOLLERR;
	case UFFD_STATE_RUNNING:
		/*
		 * poll() never guarantees that read won't block.
		 * userfaults can be waken before they're read().
		 */
		if (unlikely(!(file->f_flags & O_NONBLOCK)))
			return EPOLLERR;
		/*
		 * Lockless check whether there are pending faults or
		 * events ready to be read. The smp_mb() orders this
		 * check after the poll_wait() registration above, so
		 * wakeups cannot be missed.
		 */
		ret = 0;
		smp_mb();
		if (waitqueue_active(&ctx->fault_pending_wqh))
			ret = EPOLLIN;
		else if (waitqueue_active(&ctx->event_wqh))
			ret = EPOLLIN;

		return ret;
	default:
		WARN_ON_ONCE(1);
		return EPOLLERR;
	}
}

static const struct file_operations userfaultfd_fops;

static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
				  struct userfaultfd_ctx *new,
				  struct uffd_msg *msg)
{
	int fd;

	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
			      O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
	if (fd < 0)
		return fd;

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = fd;
	return 0;
}

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * using a list.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
			 * wakeups because during the refile both
			 * waitqueue could become empty if this is the
			 * only userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * The fault_pending_wqh.lock prevents the uwq
			 * to disappear from under us.
			 *
			 * Refile this userfault from fault_pending_wqh
			 * to fault_wqh, it's not pending anymore after
			 * we read it.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
		spin_lock(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we've to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock(&ctx->event_wqh.lock);
	}

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	for (;;) {
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow to read more than one fault at time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
	spin_unlock(&ctx->fault_pending_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or up_read(mmap_sem) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use the lockless waitqueue_active() check in a repeatable
	 * sequence protected by refile_seq, to be sure not to miss a
	 * userfault being refiled from fault_pending_wqh to fault_wqh
	 * by userfaultfd_ctx_read().
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	__u64 task_size = mm->task_size;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (start < mmap_min_addr)
		return -EINVAL;
	if (start >= task_size)
		return -EINVAL;
	if (len > task_size - start)
		return -EINVAL;
	return 0;
}

static inline bool vma_can_userfault(struct vm_area_struct *vma)
{
	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
		vma_is_shmem(vma);
}

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
				     UFFDIO_REGISTER_MODE_WP))
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
		vm_flags |= VM_UFFD_WP;
		/*
		 * FIXME: remove the below error constraint by
		 * implementing the wprotect tracking mode.
		 */
		ret = -EINVAL;
		goto out;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	down_write(&mm->mmap_sem);
	if (!mmget_still_valid(mm))
		goto out_unlock;
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	basic_ioctls = false;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/* check not compatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED vma, the process has write permission to
		 * the backing file: VM_MAYWRITE is set only if the
		 * file allows writes.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains the ending address, and it is a
		 * hugetlb vma, check the end alignment too.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }));
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx.ctx = ctx;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	up_write(&mm->mmap_sem);
	mmput(mm);
	if (!ret) {
		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
			     UFFD_API_RANGE_IOCTLS,
			     &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	down_write(&mm->mmap_sem);
	if (!mmget_still_valid(mm))
		goto out_unlock;
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	ret = -EINVAL;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/*
		 * Check not compatible vmas, not strictly required
		 * here as not compatible vmas cannot have an
		 * userfaultfd_ctx registered on them, but this forbids
		 * userland from calling UFFDIO_UNREGISTER on them
		 * anyway.
		 */
		if (!vma_can_userfault(cur))
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma));

		/*
		 * Nothing to do: this vma has no userfaultfd
		 * registered on it.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and it avoids userland to call
			 * UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX);
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	up_write(&mm->mmap_sem);
	mmput(mm);
out:
	return ret;
}

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}

static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;

	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
	if (ret)
		goto out;
	/*
	 * double check for wraparound just in case. copy_from_user()
	 * will later check uffdio_copy.src + uffdio_copy.len to fit
	 * in the userland range.
	 */
	ret = -EINVAL;
	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
		goto out;
	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
		goto out;
	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
				   uffdio_copy.len, &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}

static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
				     uffdio_zeropage.range.len,
				     &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide
	 */
	return (unsigned int)user_features;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	int ret;
	__u64 features;

	ret = -EINVAL;
	if (ctx->state != UFFD_STATE_WAIT_API)
		goto out;
	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
		memset(&uffdio_api, 0, sizeof(uffdio_api));
		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
			goto out;
		ret = -EINVAL;
		goto out;
	}
	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;
	ctx->state = UFFD_STATE_RUNNING;
	/* only enable the requested features for this uffd context */
	ctx->features = uffd_ctx_features(features);
	ret = 0;
out:
	return ret;
}

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols will be added, there will be all shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= userfaultfd_ioctl,
	.llseek		= noop_llseek,
};

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_init(&ctx->refile_seq);
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency.  */
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	atomic_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
	ctx->mmap_changing = false;
	ctx->mm = current->mm;
	/* prevent the mm struct to be freed */
	mmgrab(ctx->mm);

	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
			      O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

static int __init userfaultfd_init(void)
{
	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
	return 0;
}
__initcall(userfaultfd_init);