// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three level of locking required by epoll :
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 *
 * ep->lock protects the ready list (rdllist) and the overflow list
 * (ovflist).  It is taken with interrupts disabled because the event
 * callback ep_poll_callback() runs from wait queue wakeups and therefore
 * must not sleep.
 *
 * ep->mtx protects the RB tree of monitored files and is held across
 * epoll_ctl() operations and while ready events are transferred to
 * userspace, so that epitems and their target files cannot go away
 * underneath us.
 *
 * epmutex is a global mutex: it is taken by ep_free() and
 * eventpoll_release_file(), and during EPOLL_CTL_ADD when a full loop /
 * reverse-path check of nested epoll descriptors has to be performed.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

/* Event bits that may be combined with EPOLLEXCLUSIVE */
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/* Marker stored in epi->next when the item is not on the overflow list */
#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will have an entry
 * of this type linked to the "rbr" RB tree.  Avoid increasing the size of
 * this struct: there can be many thousands of these on a busy server.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them.  It is held across epoll_ctl()
	 * operations, the event transfer loop and the file cleanup path.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem"
	 * that received events while the ready list was being transferred
	 * to userspace without holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize the loop detection check */
	u64 gen;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nesting depth for lockdep validation */
	u8 nests;
#endif
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
	int res;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watches, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

/* Generation counter used to avoid revisiting the same epoll set twice per loop check */
static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths.  Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &long_zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys: order by file pointer first, then by fd */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked into a ready list */
static inline int ep_is_linked(struct epitem *epi)
{
	return !list_empty(&epi->rdllink);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Initialize the structure used to track nested poll calls */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}
366
367
368
369
370
371
372
373
374
375static inline int ep_events_available(struct eventpoll *ep)
376{
377 return !list_empty_careful(&ep->rdllist) ||
378 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
379}
380
381#ifdef CONFIG_NET_RX_BUSY_POLL
382static bool ep_busy_loop_end(void *p, unsigned long start_time)
383{
384 struct eventpoll *ep = p;
385
386 return ep_events_available(ep) || busy_loop_timeout(start_time);
387}
388
389
390
391
392
393
394
395static void ep_busy_loop(struct eventpoll *ep, int nonblock)
396{
397 unsigned int napi_id = READ_ONCE(ep->napi_id);
398
399 if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
400 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
401}
402
403static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
404{
405 if (ep->napi_id)
406 ep->napi_id = 0;
407}
408
409
410
411
412static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
413{
414 struct eventpoll *ep;
415 unsigned int napi_id;
416 struct socket *sock;
417 struct sock *sk;
418 int err;
419
420 if (!net_busy_loop_on())
421 return;
422
423 sock = sock_from_file(epi->ffd.file, &err);
424 if (!sock)
425 return;
426
427 sk = sock->sk;
428 if (!sk)
429 return;
430
431 napi_id = READ_ONCE(sk->sk_napi_id);
432 ep = epi->ep;
433
434
435
436
437
438 if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
439 return;
440
441
442 ep->napi_id = napi_id;
443}
444
445#else
446
447static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
448{
449}
450
451static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
452{
453}
454
455static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
456{
457}
458
459#endif
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476static int ep_call_nested(struct nested_calls *ncalls,
477 int (*nproc)(void *, void *, int), void *priv,
478 void *cookie, void *ctx)
479{
480 int error, call_nests = 0;
481 unsigned long flags;
482 struct list_head *lsthead = &ncalls->tasks_call_list;
483 struct nested_call_node *tncur;
484 struct nested_call_node tnode;
485
486 spin_lock_irqsave(&ncalls->lock, flags);
487
488
489
490
491
492
493 list_for_each_entry(tncur, lsthead, llink) {
494 if (tncur->ctx == ctx &&
495 (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
496
497
498
499
500 error = -1;
501 goto out_unlock;
502 }
503 }
504
505
506 tnode.ctx = ctx;
507 tnode.cookie = cookie;
508 list_add(&tnode.llink, lsthead);
509
510 spin_unlock_irqrestore(&ncalls->lock, flags);
511
512
513 error = (*nproc)(priv, cookie, call_nests);
514
515
516 spin_lock_irqsave(&ncalls->lock, flags);
517 list_del(&tnode.llink);
518out_unlock:
519 spin_unlock_irqrestore(&ncalls->lock, flags);
520
521 return error;
522}
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549#ifdef CONFIG_DEBUG_LOCK_ALLOC
550
551static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
552{
553 struct eventpoll *ep_src;
554 unsigned long flags;
555 u8 nests = 0;
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572 if (epi) {
573 if ((is_file_epoll(epi->ffd.file))) {
574 ep_src = epi->ffd.file->private_data;
575 nests = ep_src->nests;
576 } else {
577 nests = 1;
578 }
579 }
580 spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
581 ep->nests = nests + 1;
582 wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
583 ep->nests = 0;
584 spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
585}
586
587#else
588
589static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
590{
591 wake_up_poll(&ep->poll_wait, EPOLLIN);
592}
593
594#endif
595
596static void ep_remove_wait_queue(struct eppoll_entry *pwq)
597{
598 wait_queue_head_t *whead;
599
600 rcu_read_lock();
601
602
603
604
605
606
607 whead = smp_load_acquire(&pwq->whead);
608 if (whead)
609 remove_wait_queue(whead, &pwq->wait);
610 rcu_read_unlock();
611}
612
613
614
615
616
617
618static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
619{
620 struct list_head *lsthead = &epi->pwqlist;
621 struct eppoll_entry *pwq;
622
623 while (!list_empty(lsthead)) {
624 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
625
626 list_del(&pwq->llink);
627 ep_remove_wait_queue(pwq);
628 kmem_cache_free(pwq_cache, pwq);
629 }
630}
631
632
633static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
634{
635 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
636}
637
638
639static inline void ep_pm_stay_awake(struct epitem *epi)
640{
641 struct wakeup_source *ws = ep_wakeup_source(epi);
642
643 if (ws)
644 __pm_stay_awake(ws);
645}
646
647static inline bool ep_has_wakeup_source(struct epitem *epi)
648{
649 return rcu_access_pointer(epi->ws) ? true : false;
650}
651
652
653static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
654{
655 struct wakeup_source *ws;
656
657 rcu_read_lock();
658 ws = rcu_dereference(epi->ws);
659 if (ws)
660 __pm_stay_awake(ws);
661 rcu_read_unlock();
662}
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677static __poll_t ep_scan_ready_list(struct eventpoll *ep,
678 __poll_t (*sproc)(struct eventpoll *,
679 struct list_head *, void *),
680 void *priv, int depth, bool ep_locked)
681{
682 __poll_t res;
683 struct epitem *epi, *nepi;
684 LIST_HEAD(txlist);
685
686 lockdep_assert_irqs_enabled();
687
688
689
690
691
692
693 if (!ep_locked)
694 mutex_lock_nested(&ep->mtx, depth);
695
696
697
698
699
700
701
702
703
704 write_lock_irq(&ep->lock);
705 list_splice_init(&ep->rdllist, &txlist);
706 WRITE_ONCE(ep->ovflist, NULL);
707 write_unlock_irq(&ep->lock);
708
709
710
711
712 res = (*sproc)(ep, &txlist, priv);
713
714 write_lock_irq(&ep->lock);
715
716
717
718
719
720 for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
721 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
722
723
724
725
726
727
728 if (!ep_is_linked(epi)) {
729
730
731
732
733 list_add(&epi->rdllink, &ep->rdllist);
734 ep_pm_stay_awake(epi);
735 }
736 }
737
738
739
740
741
742 WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
743
744
745
746
747 list_splice(&txlist, &ep->rdllist);
748 __pm_relax(ep->ws);
749 write_unlock_irq(&ep->lock);
750
751 if (!ep_locked)
752 mutex_unlock(&ep->mtx);
753
754 return res;
755}
756
757static void epi_rcu_free(struct rcu_head *head)
758{
759 struct epitem *epi = container_of(head, struct epitem, rcu);
760 kmem_cache_free(epi_cache, epi);
761}
762
763
764
765
766
767static int ep_remove(struct eventpoll *ep, struct epitem *epi)
768{
769 struct file *file = epi->ffd.file;
770
771 lockdep_assert_irqs_enabled();
772
773
774
775
776 ep_unregister_pollwait(ep, epi);
777
778
779 spin_lock(&file->f_lock);
780 list_del_rcu(&epi->fllink);
781 spin_unlock(&file->f_lock);
782
783 rb_erase_cached(&epi->rbn, &ep->rbr);
784
785 write_lock_irq(&ep->lock);
786 if (ep_is_linked(epi))
787 list_del_init(&epi->rdllink);
788 write_unlock_irq(&ep->lock);
789
790 wakeup_source_unregister(ep_wakeup_source(epi));
791
792
793
794
795
796
797
798 call_rcu(&epi->rcu, epi_rcu_free);
799
800 atomic_long_dec(&ep->user->epoll_watches);
801
802 return 0;
803}
804
805static void ep_free(struct eventpoll *ep)
806{
807 struct rb_node *rbp;
808 struct epitem *epi;
809
810
811 if (waitqueue_active(&ep->poll_wait))
812 ep_poll_safewake(ep, NULL);
813
814
815
816
817
818
819
820
821
822 mutex_lock(&epmutex);
823
824
825
826
827 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
828 epi = rb_entry(rbp, struct epitem, rbn);
829
830 ep_unregister_pollwait(ep, epi);
831 cond_resched();
832 }
833
834
835
836
837
838
839
840
841
842 mutex_lock(&ep->mtx);
843 while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
844 epi = rb_entry(rbp, struct epitem, rbn);
845 ep_remove(ep, epi);
846 cond_resched();
847 }
848 mutex_unlock(&ep->mtx);
849
850 mutex_unlock(&epmutex);
851 mutex_destroy(&ep->mtx);
852 free_uid(ep->user);
853 wakeup_source_unregister(ep->ws);
854 kfree(ep);
855}
856
857static int ep_eventpoll_release(struct inode *inode, struct file *file)
858{
859 struct eventpoll *ep = file->private_data;
860
861 if (ep)
862 ep_free(ep);
863
864 return 0;
865}
866
867static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
868 void *priv);
869static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
870 poll_table *pt);
871
872
873
874
875
876
877static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
878 int depth)
879{
880 struct eventpoll *ep;
881 bool locked;
882
883 pt->_key = epi->event.events;
884 if (!is_file_epoll(epi->ffd.file))
885 return vfs_poll(epi->ffd.file, pt) & epi->event.events;
886
887 ep = epi->ffd.file->private_data;
888 poll_wait(epi->ffd.file, &ep->poll_wait, pt);
889 locked = pt && (pt->_qproc == ep_ptable_queue_proc);
890
891 return ep_scan_ready_list(epi->ffd.file->private_data,
892 ep_read_events_proc, &depth, depth,
893 locked) & epi->event.events;
894}
895
896static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
897 void *priv)
898{
899 struct epitem *epi, *tmp;
900 poll_table pt;
901 int depth = *(int *)priv;
902
903 init_poll_funcptr(&pt, NULL);
904 depth++;
905
906 list_for_each_entry_safe(epi, tmp, head, rdllink) {
907 if (ep_item_poll(epi, &pt, depth)) {
908 return EPOLLIN | EPOLLRDNORM;
909 } else {
910
911
912
913
914
915 __pm_relax(ep_wakeup_source(epi));
916 list_del_init(&epi->rdllink);
917 }
918 }
919
920 return 0;
921}
922
923static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
924{
925 struct eventpoll *ep = file->private_data;
926 int depth = 0;
927
928
929 poll_wait(file, &ep->poll_wait, wait);
930
931
932
933
934
935 return ep_scan_ready_list(ep, ep_read_events_proc,
936 &depth, depth, false);
937}
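
/*
 * Note: an epoll file descriptor is itself pollable, which is what makes
 * nesting possible: ep_eventpoll_poll() reports EPOLLIN when the ready list
 * is non-empty.  A minimal userspace sketch of that property (illustrative
 * only, not part of this file; the descriptors are made up):
 *
 *	int inner = epoll_create1(0);
 *	int outer = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = inner };
 *	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);
 *	// waiting on "outer" now also wakes up when "inner" has ready events
 */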
938
939#ifdef CONFIG_PROC_FS
940static void ep_show_fdinfo(struct seq_file *m, struct file *f)
941{
942 struct eventpoll *ep = f->private_data;
943 struct rb_node *rbp;
944
945 mutex_lock(&ep->mtx);
946 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
947 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
948 struct inode *inode = file_inode(epi->ffd.file);
949
950 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
951 " pos:%lli ino:%lx sdev:%x\n",
952 epi->ffd.fd, epi->event.events,
953 (long long)epi->event.data,
954 (long long)epi->ffd.file->f_pos,
955 inode->i_ino, inode->i_sb->s_dev);
956 if (seq_has_overflowed(m))
957 break;
958 }
959 mutex_unlock(&ep->mtx);
960}
961#endif
962
963
964static const struct file_operations eventpoll_fops = {
965#ifdef CONFIG_PROC_FS
966 .show_fdinfo = ep_show_fdinfo,
967#endif
968 .release = ep_eventpoll_release,
969 .poll = ep_eventpoll_poll,
970 .llseek = noop_llseek,
971};
972
973
974
975
976
977
978void eventpoll_release_file(struct file *file)
979{
980 struct eventpoll *ep;
981 struct epitem *epi, *next;
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996 mutex_lock(&epmutex);
997 list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
998 ep = epi->ep;
999 mutex_lock_nested(&ep->mtx, 0);
1000 ep_remove(ep, epi);
1001 mutex_unlock(&ep->mtx);
1002 }
1003 mutex_unlock(&epmutex);
1004}
1005
1006static int ep_alloc(struct eventpoll **pep)
1007{
1008 int error;
1009 struct user_struct *user;
1010 struct eventpoll *ep;
1011
1012 user = get_current_user();
1013 error = -ENOMEM;
1014 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
1015 if (unlikely(!ep))
1016 goto free_uid;
1017
1018 mutex_init(&ep->mtx);
1019 rwlock_init(&ep->lock);
1020 init_waitqueue_head(&ep->wq);
1021 init_waitqueue_head(&ep->poll_wait);
1022 INIT_LIST_HEAD(&ep->rdllist);
1023 ep->rbr = RB_ROOT_CACHED;
1024 ep->ovflist = EP_UNACTIVE_PTR;
1025 ep->user = user;
1026
1027 *pep = ep;
1028
1029 return 0;
1030
1031free_uid:
1032 free_uid(user);
1033 return error;
1034}
1035
1036
1037
1038
1039
1040
1041static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
1042{
1043 int kcmp;
1044 struct rb_node *rbp;
1045 struct epitem *epi, *epir = NULL;
1046 struct epoll_filefd ffd;
1047
1048 ep_set_ffd(&ffd, file, fd);
1049 for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1050 epi = rb_entry(rbp, struct epitem, rbn);
1051 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
1052 if (kcmp > 0)
1053 rbp = rbp->rb_right;
1054 else if (kcmp < 0)
1055 rbp = rbp->rb_left;
1056 else {
1057 epir = epi;
1058 break;
1059 }
1060 }
1061
1062 return epir;
1063}
1064
1065#ifdef CONFIG_CHECKPOINT_RESTORE
1066static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
1067{
1068 struct rb_node *rbp;
1069 struct epitem *epi;
1070
1071 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1072 epi = rb_entry(rbp, struct epitem, rbn);
1073 if (epi->ffd.fd == tfd) {
1074 if (toff == 0)
1075 return epi;
1076 else
1077 toff--;
1078 }
1079 cond_resched();
1080 }
1081
1082 return NULL;
1083}
1084
1085struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
1086 unsigned long toff)
1087{
1088 struct file *file_raw;
1089 struct eventpoll *ep;
1090 struct epitem *epi;
1091
1092 if (!is_file_epoll(file))
1093 return ERR_PTR(-EINVAL);
1094
1095 ep = file->private_data;
1096
1097 mutex_lock(&ep->mtx);
1098 epi = ep_find_tfd(ep, tfd, toff);
1099 if (epi)
1100 file_raw = epi->ffd.file;
1101 else
1102 file_raw = ERR_PTR(-ENOENT);
1103 mutex_unlock(&ep->mtx);
1104
1105 return file_raw;
1106}
1107#endif
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127static inline bool list_add_tail_lockless(struct list_head *new,
1128 struct list_head *head)
1129{
1130 struct list_head *prev;
1131
1132
1133
1134
1135
1136
1137
1138 if (cmpxchg(&new->next, new, head) != new)
1139 return false;
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149 prev = xchg(&head->prev, new);
1150
1151
1152
1153
1154
1155
1156 prev->next = new;
1157 new->prev = prev;
1158
1159 return true;
1160}
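
/*
 * How the lockless add above stays safe (summary, matching the code): a
 * free list_head is initialized with new->next == new, so the
 * cmpxchg(&new->next, new, head) acts as an atomic "claim" of the entry;
 * concurrent callers that lose the race see new->next != new and bail out.
 * The xchg() on head->prev then atomically picks the unique predecessor this
 * entry is spliced after, so multiple ep_poll_callback() instances can append
 * to ->rdllist under the read side of ep->lock without corrupting the list.
 */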
1161
1162
1163
1164
1165
1166
1167
1168static inline bool chain_epi_lockless(struct epitem *epi)
1169{
1170 struct eventpoll *ep = epi->ep;
1171
1172
1173 if (epi->next != EP_UNACTIVE_PTR)
1174 return false;
1175
1176
1177 if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
1178 return false;
1179
1180
1181 epi->next = xchg(&ep->ovflist, epi);
1182
1183 return true;
1184}
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
1205{
1206 int pwake = 0;
1207 struct epitem *epi = ep_item_from_wait(wait);
1208 struct eventpoll *ep = epi->ep;
1209 __poll_t pollflags = key_to_poll(key);
1210 unsigned long flags;
1211 int ewake = 0;
1212
1213 read_lock_irqsave(&ep->lock, flags);
1214
1215 ep_set_busy_poll_napi_id(epi);
1216
1217
1218
1219
1220
1221
1222
1223 if (!(epi->event.events & ~EP_PRIVATE_BITS))
1224 goto out_unlock;
1225
1226
1227
1228
1229
1230
1231
1232 if (pollflags && !(pollflags & epi->event.events))
1233 goto out_unlock;
1234
1235
1236
1237
1238
1239
1240
1241 if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
1242 if (chain_epi_lockless(epi))
1243 ep_pm_stay_awake_rcu(epi);
1244 } else if (!ep_is_linked(epi)) {
1245
1246 if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
1247 ep_pm_stay_awake_rcu(epi);
1248 }
1249
1250
1251
1252
1253
1254 if (waitqueue_active(&ep->wq)) {
1255 if ((epi->event.events & EPOLLEXCLUSIVE) &&
1256 !(pollflags & POLLFREE)) {
1257 switch (pollflags & EPOLLINOUT_BITS) {
1258 case EPOLLIN:
1259 if (epi->event.events & EPOLLIN)
1260 ewake = 1;
1261 break;
1262 case EPOLLOUT:
1263 if (epi->event.events & EPOLLOUT)
1264 ewake = 1;
1265 break;
1266 case 0:
1267 ewake = 1;
1268 break;
1269 }
1270 }
1271 wake_up(&ep->wq);
1272 }
1273 if (waitqueue_active(&ep->poll_wait))
1274 pwake++;
1275
1276out_unlock:
1277 read_unlock_irqrestore(&ep->lock, flags);
1278
1279
1280 if (pwake)
1281 ep_poll_safewake(ep, epi);
1282
1283 if (!(epi->event.events & EPOLLEXCLUSIVE))
1284 ewake = 1;
1285
1286 if (pollflags & POLLFREE) {
1287
1288
1289
1290
1291
1292 list_del_init(&wait->entry);
1293
1294
1295
1296
1297
1298
1299 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
1300 }
1301
1302 return ewake;
1303}
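
/*
 * The EPOLLEXCLUSIVE handling above is what avoids a thundering herd when
 * several epoll sets watch the same event source: the callback's return
 * value (ewake) tells __wake_up_common() whether this exclusive waiter
 * consumed the wakeup, and returning 0 for events an EPOLLEXCLUSIVE item is
 * not interested in lets the wakeup move on to another waiter instead of
 * being lost.  Illustrative userspace sketch (not part of this file;
 * "listen_fd" is a made-up descriptor shared by several processes):
 *
 *	struct epoll_event ev = {
 *		.events = EPOLLIN | EPOLLEXCLUSIVE,
 *		.data.fd = listen_fd,
 *	};
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
 *	// when a connection arrives, roughly one of the waiting epoll
 *	// instances is woken instead of all of them
 */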
1304
1305
1306
1307
1308
1309static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1310 poll_table *pt)
1311{
1312 struct epitem *epi = ep_item_from_epqueue(pt);
1313 struct eppoll_entry *pwq;
1314
1315 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1316 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1317 pwq->whead = whead;
1318 pwq->base = epi;
1319 if (epi->event.events & EPOLLEXCLUSIVE)
1320 add_wait_queue_exclusive(whead, &pwq->wait);
1321 else
1322 add_wait_queue(whead, &pwq->wait);
1323 list_add_tail(&pwq->llink, &epi->pwqlist);
1324 epi->nwait++;
1325 } else {
1326
1327 epi->nwait = -1;
1328 }
1329}
1330
1331static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1332{
1333 int kcmp;
1334 struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1335 struct epitem *epic;
1336 bool leftmost = true;
1337
1338 while (*p) {
1339 parent = *p;
1340 epic = rb_entry(parent, struct epitem, rbn);
1341 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1342 if (kcmp > 0) {
1343 p = &parent->rb_right;
1344 leftmost = false;
1345 } else
1346 p = &parent->rb_left;
1347 }
1348 rb_link_node(&epi->rbn, parent, p);
1349 rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1350}
1351
1352
1353
1354#define PATH_ARR_SIZE 5
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1367static int path_count[PATH_ARR_SIZE];
1368
1369static int path_count_inc(int nests)
1370{
1371
1372 if (nests == 0)
1373 return 0;
1374
1375 if (++path_count[nests] > path_limits[nests])
1376 return -1;
1377 return 0;
1378}
1379
1380static void path_count_init(void)
1381{
1382 int i;
1383
1384 for (i = 0; i < PATH_ARR_SIZE; i++)
1385 path_count[i] = 0;
1386}
1387
1388static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1389{
1390 int error = 0;
1391 struct file *file = priv;
1392 struct file *child_file;
1393 struct epitem *epi;
1394
1395
1396 rcu_read_lock();
1397 list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
1398 child_file = epi->ep->file;
1399 if (is_file_epoll(child_file)) {
1400 if (list_empty(&child_file->f_ep_links)) {
1401 if (path_count_inc(call_nests)) {
1402 error = -1;
1403 break;
1404 }
1405 } else {
1406 error = ep_call_nested(&poll_loop_ncalls,
1407 reverse_path_check_proc,
1408 child_file, child_file,
1409 current);
1410 }
1411 if (error != 0)
1412 break;
1413 } else {
1414 printk(KERN_ERR "reverse_path_check_proc: "
1415 "file is not an ep!\n");
1416 }
1417 }
1418 rcu_read_unlock();
1419 return error;
1420}
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432static int reverse_path_check(void)
1433{
1434 int error = 0;
1435 struct file *current_file;
1436
1437
1438 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1439 path_count_init();
1440 error = ep_call_nested(&poll_loop_ncalls,
1441 reverse_path_check_proc, current_file,
1442 current_file, current);
1443 if (error)
1444 break;
1445 }
1446 return error;
1447}
1448
1449static int ep_create_wakeup_source(struct epitem *epi)
1450{
1451 struct name_snapshot n;
1452 struct wakeup_source *ws;
1453
1454 if (!epi->ep->ws) {
1455 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
1456 if (!epi->ep->ws)
1457 return -ENOMEM;
1458 }
1459
1460 take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
1461 ws = wakeup_source_register(NULL, n.name.name);
1462 release_dentry_name_snapshot(&n);
1463
1464 if (!ws)
1465 return -ENOMEM;
1466 rcu_assign_pointer(epi->ws, ws);
1467
1468 return 0;
1469}
1470
1471
1472static noinline void ep_destroy_wakeup_source(struct epitem *epi)
1473{
1474 struct wakeup_source *ws = ep_wakeup_source(epi);
1475
1476 RCU_INIT_POINTER(epi->ws, NULL);
1477
1478
1479
1480
1481
1482
1483 synchronize_rcu();
1484 wakeup_source_unregister(ws);
1485}
1486
1487
1488
1489
1490static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
1491 struct file *tfile, int fd, int full_check)
1492{
1493 int error, pwake = 0;
1494 __poll_t revents;
1495 long user_watches;
1496 struct epitem *epi;
1497 struct ep_pqueue epq;
1498
1499 lockdep_assert_irqs_enabled();
1500
1501 user_watches = atomic_long_read(&ep->user->epoll_watches);
1502 if (unlikely(user_watches >= max_user_watches))
1503 return -ENOSPC;
1504 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1505 return -ENOMEM;
1506
1507
1508 INIT_LIST_HEAD(&epi->rdllink);
1509 INIT_LIST_HEAD(&epi->fllink);
1510 INIT_LIST_HEAD(&epi->pwqlist);
1511 epi->ep = ep;
1512 ep_set_ffd(&epi->ffd, tfile, fd);
1513 epi->event = *event;
1514 epi->nwait = 0;
1515 epi->next = EP_UNACTIVE_PTR;
1516 if (epi->event.events & EPOLLWAKEUP) {
1517 error = ep_create_wakeup_source(epi);
1518 if (error)
1519 goto error_create_wakeup_source;
1520 } else {
1521 RCU_INIT_POINTER(epi->ws, NULL);
1522 }
1523
1524
1525 spin_lock(&tfile->f_lock);
1526 list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
1527 spin_unlock(&tfile->f_lock);
1528
1529
1530
1531
1532
1533 ep_rbtree_insert(ep, epi);
1534
1535
1536 error = -EINVAL;
1537 if (full_check && reverse_path_check())
1538 goto error_remove_epi;
1539
1540
1541 epq.epi = epi;
1542 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1543
1544
1545
1546
1547
1548
1549
1550
1551 revents = ep_item_poll(epi, &epq.pt, 1);
1552
1553
1554
1555
1556
1557
1558 error = -ENOMEM;
1559 if (epi->nwait < 0)
1560 goto error_unregister;
1561
1562
1563 write_lock_irq(&ep->lock);
1564
1565
1566 ep_set_busy_poll_napi_id(epi);
1567
1568
1569 if (revents && !ep_is_linked(epi)) {
1570 list_add_tail(&epi->rdllink, &ep->rdllist);
1571 ep_pm_stay_awake(epi);
1572
1573
1574 if (waitqueue_active(&ep->wq))
1575 wake_up(&ep->wq);
1576 if (waitqueue_active(&ep->poll_wait))
1577 pwake++;
1578 }
1579
1580 write_unlock_irq(&ep->lock);
1581
1582 atomic_long_inc(&ep->user->epoll_watches);
1583
1584
1585 if (pwake)
1586 ep_poll_safewake(ep, NULL);
1587
1588 return 0;
1589
1590error_unregister:
1591 ep_unregister_pollwait(ep, epi);
1592error_remove_epi:
1593 spin_lock(&tfile->f_lock);
1594 list_del_rcu(&epi->fllink);
1595 spin_unlock(&tfile->f_lock);
1596
1597 rb_erase_cached(&epi->rbn, &ep->rbr);
1598
1599
1600
1601
1602
1603
1604
1605 write_lock_irq(&ep->lock);
1606 if (ep_is_linked(epi))
1607 list_del_init(&epi->rdllink);
1608 write_unlock_irq(&ep->lock);
1609
1610 wakeup_source_unregister(ep_wakeup_source(epi));
1611
1612error_create_wakeup_source:
1613 kmem_cache_free(epi_cache, epi);
1614
1615 return error;
1616}
1617
1618
1619
1620
1621
1622static int ep_modify(struct eventpoll *ep, struct epitem *epi,
1623 const struct epoll_event *event)
1624{
1625 int pwake = 0;
1626 poll_table pt;
1627
1628 lockdep_assert_irqs_enabled();
1629
1630 init_poll_funcptr(&pt, NULL);
1631
1632
1633
1634
1635
1636
1637 epi->event.events = event->events;
1638 epi->event.data = event->data;
1639 if (epi->event.events & EPOLLWAKEUP) {
1640 if (!ep_has_wakeup_source(epi))
1641 ep_create_wakeup_source(epi);
1642 } else if (ep_has_wakeup_source(epi)) {
1643 ep_destroy_wakeup_source(epi);
1644 }
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664 smp_mb();
1665
1666
1667
1668
1669
1670
1671
1672 if (ep_item_poll(epi, &pt, 1)) {
1673 write_lock_irq(&ep->lock);
1674 if (!ep_is_linked(epi)) {
1675 list_add_tail(&epi->rdllink, &ep->rdllist);
1676 ep_pm_stay_awake(epi);
1677
1678
1679 if (waitqueue_active(&ep->wq))
1680 wake_up(&ep->wq);
1681 if (waitqueue_active(&ep->poll_wait))
1682 pwake++;
1683 }
1684 write_unlock_irq(&ep->lock);
1685 }
1686
1687
1688 if (pwake)
1689 ep_poll_safewake(ep, NULL);
1690
1691 return 0;
1692}
1693
1694static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1695 void *priv)
1696{
1697 struct ep_send_events_data *esed = priv;
1698 __poll_t revents;
1699 struct epitem *epi, *tmp;
1700 struct epoll_event __user *uevent = esed->events;
1701 struct wakeup_source *ws;
1702 poll_table pt;
1703
1704 init_poll_funcptr(&pt, NULL);
1705 esed->res = 0;
1706
1707
1708
1709
1710
1711
1712 lockdep_assert_held(&ep->mtx);
1713
1714 list_for_each_entry_safe(epi, tmp, head, rdllink) {
1715 if (esed->res >= esed->maxevents)
1716 break;
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727 ws = ep_wakeup_source(epi);
1728 if (ws) {
1729 if (ws->active)
1730 __pm_stay_awake(ep->ws);
1731 __pm_relax(ws);
1732 }
1733
1734 list_del_init(&epi->rdllink);
1735
1736
1737
1738
1739
1740
1741
1742 revents = ep_item_poll(epi, &pt, 1);
1743 if (!revents)
1744 continue;
1745
1746 if (__put_user(revents, &uevent->events) ||
1747 __put_user(epi->event.data, &uevent->data)) {
1748 list_add(&epi->rdllink, head);
1749 ep_pm_stay_awake(epi);
1750 if (!esed->res)
1751 esed->res = -EFAULT;
1752 return 0;
1753 }
1754 esed->res++;
1755 uevent++;
1756 if (epi->event.events & EPOLLONESHOT)
1757 epi->event.events &= EP_PRIVATE_BITS;
1758 else if (!(epi->event.events & EPOLLET)) {
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770 list_add_tail(&epi->rdllink, &ep->rdllist);
1771 ep_pm_stay_awake(epi);
1772 }
1773 }
1774
1775 return 0;
1776}
1777
1778static int ep_send_events(struct eventpoll *ep,
1779 struct epoll_event __user *events, int maxevents)
1780{
1781 struct ep_send_events_data esed;
1782
1783 esed.maxevents = maxevents;
1784 esed.events = events;
1785
1786 ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
1787 return esed.res;
1788}
1789
1790static inline struct timespec64 ep_set_mstimeout(long ms)
1791{
1792 struct timespec64 now, ts = {
1793 .tv_sec = ms / MSEC_PER_SEC,
1794 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1795 };
1796
1797 ktime_get_ts64(&now);
1798 return timespec64_add_safe(now, ts);
1799}
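
/*
 * Example of the conversion done by ep_set_mstimeout() (illustrative): a
 * timeout of 2500 ms becomes the relative timespec64 { .tv_sec = 2,
 * .tv_nsec = 500000000 }, which is then added to the current monotonic time
 * so that ep_poll() can program an absolute hrtimer expiry.
 */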
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1819 int maxevents, long timeout)
1820{
1821 int res = 0, eavail, timed_out = 0;
1822 u64 slack = 0;
1823 wait_queue_entry_t wait;
1824 ktime_t expires, *to = NULL;
1825
1826 lockdep_assert_irqs_enabled();
1827
1828 if (timeout > 0) {
1829 struct timespec64 end_time = ep_set_mstimeout(timeout);
1830
1831 slack = select_estimate_accuracy(&end_time);
1832 to = &expires;
1833 *to = timespec64_to_ktime(end_time);
1834 } else if (timeout == 0) {
1835
1836
1837
1838
1839
1840
1841
1842 timed_out = 1;
1843
1844 write_lock_irq(&ep->lock);
1845 eavail = ep_events_available(ep);
1846 write_unlock_irq(&ep->lock);
1847
1848 goto send_events;
1849 }
1850
1851fetch_events:
1852
1853 if (!ep_events_available(ep))
1854 ep_busy_loop(ep, timed_out);
1855
1856 eavail = ep_events_available(ep);
1857 if (eavail)
1858 goto send_events;
1859
1860
1861
1862
1863
1864
1865 ep_reset_busy_poll_napi_id(ep);
1866
1867 do {
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879 init_wait(&wait);
1880
1881 write_lock_irq(&ep->lock);
1882
1883
1884
1885
1886
1887 __set_current_state(TASK_INTERRUPTIBLE);
1888
1889
1890
1891
1892
1893
1894
1895
1896 eavail = ep_events_available(ep);
1897 if (!eavail) {
1898 if (signal_pending(current))
1899 res = -EINTR;
1900 else
1901 __add_wait_queue_exclusive(&ep->wq, &wait);
1902 }
1903 write_unlock_irq(&ep->lock);
1904
1905 if (eavail || res)
1906 break;
1907
1908 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
1909 timed_out = 1;
1910 break;
1911 }
1912
1913
1914 eavail = 1;
1915
1916 } while (0);
1917
1918 __set_current_state(TASK_RUNNING);
1919
1920 if (!list_empty_careful(&wait.entry)) {
1921 write_lock_irq(&ep->lock);
1922 __remove_wait_queue(&ep->wq, &wait);
1923 write_unlock_irq(&ep->lock);
1924 }
1925
1926send_events:
1927 if (fatal_signal_pending(current)) {
1928
1929
1930
1931
1932
1933
1934 res = -EINTR;
1935 }
1936
1937
1938
1939
1940
1941 if (!res && eavail &&
1942 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1943 goto fetch_events;
1944
1945 return res;
1946}
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1964{
1965 int error = 0;
1966 struct file *file = priv;
1967 struct eventpoll *ep = file->private_data;
1968 struct eventpoll *ep_tovisit;
1969 struct rb_node *rbp;
1970 struct epitem *epi;
1971
1972 mutex_lock_nested(&ep->mtx, call_nests + 1);
1973 ep->gen = loop_check_gen;
1974 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1975 epi = rb_entry(rbp, struct epitem, rbn);
1976 if (unlikely(is_file_epoll(epi->ffd.file))) {
1977 ep_tovisit = epi->ffd.file->private_data;
1978 if (ep_tovisit->gen == loop_check_gen)
1979 continue;
1980 error = ep_call_nested(&poll_loop_ncalls,
1981 ep_loop_check_proc, epi->ffd.file,
1982 ep_tovisit, current);
1983 if (error != 0)
1984 break;
1985 } else {
1986
1987
1988
1989
1990
1991
1992
1993
1994 if (list_empty(&epi->ffd.file->f_tfile_llink)) {
1995 if (get_file_rcu(epi->ffd.file))
1996 list_add(&epi->ffd.file->f_tfile_llink,
1997 &tfile_check_list);
1998 }
1999 }
2000 }
2001 mutex_unlock(&ep->mtx);
2002
2003 return error;
2004}
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017static int ep_loop_check(struct eventpoll *ep, struct file *file)
2018{
2019 return ep_call_nested(&poll_loop_ncalls,
2020 ep_loop_check_proc, file, ep, current);
2021}
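
/*
 * Example of the cycle this check rejects (illustrative only, userspace
 * view; "a" and "b" are made-up epoll descriptors):
 *
 *	int a = epoll_create1(0), b = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN };
 *	ev.data.fd = b;
 *	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);	// ok: a watches b
 *	ev.data.fd = a;
 *	epoll_ctl(b, EPOLL_CTL_ADD, a, &ev);	// fails with -ELOOP
 *
 * Closed loops of epoll descriptors (and excessively deep chains) are
 * refused here so that a wakeup cannot recurse forever through
 * ep_poll_safewake().
 */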
2022
2023static void clear_tfile_check_list(void)
2024{
2025 struct file *file;
2026
2027
2028 while (!list_empty(&tfile_check_list)) {
2029 file = list_first_entry(&tfile_check_list, struct file,
2030 f_tfile_llink);
2031 list_del_init(&file->f_tfile_llink);
2032 fput(file);
2033 }
2034 INIT_LIST_HEAD(&tfile_check_list);
2035}
2036
2037
2038
2039
2040static int do_epoll_create(int flags)
2041{
2042 int error, fd;
2043 struct eventpoll *ep = NULL;
2044 struct file *file;
2045
2046
2047 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
2048
2049 if (flags & ~EPOLL_CLOEXEC)
2050 return -EINVAL;
2051
2052
2053
2054 error = ep_alloc(&ep);
2055 if (error < 0)
2056 return error;
2057
2058
2059
2060
2061 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
2062 if (fd < 0) {
2063 error = fd;
2064 goto out_free_ep;
2065 }
2066 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
2067 O_RDWR | (flags & O_CLOEXEC));
2068 if (IS_ERR(file)) {
2069 error = PTR_ERR(file);
2070 goto out_free_fd;
2071 }
2072 ep->file = file;
2073 fd_install(fd, file);
2074 return fd;
2075
2076out_free_fd:
2077 put_unused_fd(fd);
2078out_free_ep:
2079 ep_free(ep);
2080 return error;
2081}
2082
2083SYSCALL_DEFINE1(epoll_create1, int, flags)
2084{
2085 return do_epoll_create(flags);
2086}
2087
2088SYSCALL_DEFINE1(epoll_create, int, size)
2089{
2090 if (size <= 0)
2091 return -EINVAL;
2092
2093 return do_epoll_create(0);
2094}
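
/*
 * Typical userspace creation of an epoll instance (illustrative sketch,
 * not part of the kernel build):
 *
 *	#include <sys/epoll.h>
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
 *	if (epfd < 0)
 *		perror("epoll_create1");
 *
 * The "size" argument of the older epoll_create() is ignored apart from the
 * size <= 0 sanity check above; epoll_create1() is the preferred entry point
 * since it can also set the close-on-exec flag atomically.
 */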
2095
2096static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
2097 bool nonblock)
2098{
2099 if (!nonblock) {
2100 mutex_lock_nested(mutex, depth);
2101 return 0;
2102 }
2103 if (mutex_trylock(mutex))
2104 return 0;
2105 return -EAGAIN;
2106}
2107
2108int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
2109 bool nonblock)
2110{
2111 int error;
2112 int full_check = 0;
2113 struct fd f, tf;
2114 struct eventpoll *ep;
2115 struct epitem *epi;
2116 struct eventpoll *tep = NULL;
2117
2118 error = -EBADF;
2119 f = fdget(epfd);
2120 if (!f.file)
2121 goto error_return;
2122
2123
2124 tf = fdget(fd);
2125 if (!tf.file)
2126 goto error_fput;
2127
2128
2129 error = -EPERM;
2130 if (!file_can_poll(tf.file))
2131 goto error_tgt_fput;
2132
2133
2134 if (ep_op_has_event(op))
2135 ep_take_care_of_epollwakeup(epds);
2136
2137
2138
2139
2140
2141
2142 error = -EINVAL;
2143 if (f.file == tf.file || !is_file_epoll(f.file))
2144 goto error_tgt_fput;
2145
2146
2147
2148
2149
2150
2151 if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
2152 if (op == EPOLL_CTL_MOD)
2153 goto error_tgt_fput;
2154 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
2155 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
2156 goto error_tgt_fput;
2157 }
2158
2159
2160
2161
2162
2163 ep = f.file->private_data;
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180 error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
2181 if (error)
2182 goto error_tgt_fput;
2183 if (op == EPOLL_CTL_ADD) {
2184 if (!list_empty(&f.file->f_ep_links) ||
2185 ep->gen == loop_check_gen ||
2186 is_file_epoll(tf.file)) {
2187 mutex_unlock(&ep->mtx);
2188 error = epoll_mutex_lock(&epmutex, 0, nonblock);
2189 if (error)
2190 goto error_tgt_fput;
2191 loop_check_gen++;
2192 full_check = 1;
2193 if (is_file_epoll(tf.file)) {
2194 error = -ELOOP;
2195 if (ep_loop_check(ep, tf.file) != 0)
2196 goto error_tgt_fput;
2197 } else {
2198 get_file(tf.file);
2199 list_add(&tf.file->f_tfile_llink,
2200 &tfile_check_list);
2201 }
2202 error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
2203 if (error)
2204 goto error_tgt_fput;
2205 if (is_file_epoll(tf.file)) {
2206 tep = tf.file->private_data;
2207 error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
2208 if (error) {
2209 mutex_unlock(&ep->mtx);
2210 goto error_tgt_fput;
2211 }
2212 }
2213 }
2214 }
2215
2216
2217
2218
2219
2220
2221 epi = ep_find(ep, tf.file, fd);
2222
2223 error = -EINVAL;
2224 switch (op) {
2225 case EPOLL_CTL_ADD:
2226 if (!epi) {
2227 epds->events |= EPOLLERR | EPOLLHUP;
2228 error = ep_insert(ep, epds, tf.file, fd, full_check);
2229 } else
2230 error = -EEXIST;
2231 break;
2232 case EPOLL_CTL_DEL:
2233 if (epi)
2234 error = ep_remove(ep, epi);
2235 else
2236 error = -ENOENT;
2237 break;
2238 case EPOLL_CTL_MOD:
2239 if (epi) {
2240 if (!(epi->event.events & EPOLLEXCLUSIVE)) {
2241 epds->events |= EPOLLERR | EPOLLHUP;
2242 error = ep_modify(ep, epi, epds);
2243 }
2244 } else
2245 error = -ENOENT;
2246 break;
2247 }
2248 if (tep != NULL)
2249 mutex_unlock(&tep->mtx);
2250 mutex_unlock(&ep->mtx);
2251
2252error_tgt_fput:
2253 if (full_check) {
2254 clear_tfile_check_list();
2255 loop_check_gen++;
2256 mutex_unlock(&epmutex);
2257 }
2258
2259 fdput(tf);
2260error_fput:
2261 fdput(f);
2262error_return:
2263
2264 return error;
2265}
2266
2267
2268
2269
2270
2271
2272SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
2273 struct epoll_event __user *, event)
2274{
2275 struct epoll_event epds;
2276
2277 if (ep_op_has_event(op) &&
2278 copy_from_user(&epds, event, sizeof(struct epoll_event)))
2279 return -EFAULT;
2280
2281 return do_epoll_ctl(epfd, op, fd, &epds, false);
2282}
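
/*
 * Typical userspace registration against the syscall above (illustrative
 * sketch, not part of the kernel build; "sock_fd" is a made-up descriptor):
 *
 *	struct epoll_event ev = {
 *		.events = EPOLLIN | EPOLLET,
 *		.data.fd = sock_fd,
 *	};
 *	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev) < 0)
 *		perror("epoll_ctl");
 *
 * Note that EPOLLERR and EPOLLHUP are added implicitly by do_epoll_ctl()
 * for ADD and MOD, so userspace does not need to request them.
 */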
2283
2284
2285
2286
2287
2288static int do_epoll_wait(int epfd, struct epoll_event __user *events,
2289 int maxevents, int timeout)
2290{
2291 int error;
2292 struct fd f;
2293 struct eventpoll *ep;
2294
2295
2296 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
2297 return -EINVAL;
2298
2299
2300 if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
2301 return -EFAULT;
2302
2303
2304 f = fdget(epfd);
2305 if (!f.file)
2306 return -EBADF;
2307
2308
2309
2310
2311
2312 error = -EINVAL;
2313 if (!is_file_epoll(f.file))
2314 goto error_fput;
2315
2316
2317
2318
2319
2320 ep = f.file->private_data;
2321
2322
2323 error = ep_poll(ep, events, maxevents, timeout);
2324
2325error_fput:
2326 fdput(f);
2327 return error;
2328}
2329
2330SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
2331 int, maxevents, int, timeout)
2332{
2333 return do_epoll_wait(epfd, events, maxevents, timeout);
2334}
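
/*
 * Typical userspace event loop around the syscall above (illustrative
 * sketch, not part of the kernel build):
 *
 *	struct epoll_event events[64];
 *	for (;;) {
 *		int n = epoll_wait(epfd, events, 64, -1);
 *		if (n < 0) {
 *			if (errno == EINTR)
 *				continue;
 *			break;
 *		}
 *		for (int i = 0; i < n; i++)
 *			handle_event(events[i].data.fd, events[i].events);
 *	}
 *
 * "handle_event" is a made-up placeholder; a negative timeout means wait
 * indefinitely and a timeout of 0 makes ep_poll() return immediately.
 */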
2335
2336
2337
2338
2339
2340SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
2341 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
2342 size_t, sigsetsize)
2343{
2344 int error;
2345
2346
2347
2348
2349
2350 error = set_user_sigmask(sigmask, sigsetsize);
2351 if (error)
2352 return error;
2353
2354 error = do_epoll_wait(epfd, events, maxevents, timeout);
2355 restore_saved_sigmask_unless(error == -EINTR);
2356
2357 return error;
2358}
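
/*
 * epoll_pwait() usage from userspace (illustrative sketch, not part of the
 * kernel build): atomically unblock SIGINT only for the duration of the
 * wait, the same way pselect()/ppoll() handle their sigmask argument.
 *
 *	sigset_t mask;
 *	sigfillset(&mask);
 *	sigdelset(&mask, SIGINT);
 *	int n = epoll_pwait(epfd, events, 64, -1, &mask);
 *
 * The kernel side installs the temporary mask with set_user_sigmask() and
 * restores the original one afterwards; when the wait is interrupted
 * (-EINTR) the restore is deferred so the pending signal is delivered with
 * the temporary mask still in place.
 */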
2359
2360#ifdef CONFIG_COMPAT
2361COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2362 struct epoll_event __user *, events,
2363 int, maxevents, int, timeout,
2364 const compat_sigset_t __user *, sigmask,
2365 compat_size_t, sigsetsize)
2366{
2367 long err;
2368
2369
2370
2371
2372
2373 err = set_compat_user_sigmask(sigmask, sigsetsize);
2374 if (err)
2375 return err;
2376
2377 err = do_epoll_wait(epfd, events, maxevents, timeout);
2378 restore_saved_sigmask_unless(err == -EINTR);
2379
2380 return err;
2381}
2382#endif
2383
2384static int __init eventpoll_init(void)
2385{
2386 struct sysinfo si;
2387
2388 si_meminfo(&si);
2389
2390
2391
2392 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2393 EP_ITEM_COST;
2394 BUG_ON(max_user_watches < 0);
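
	/*
	 * Rough illustration of the limit computed above: with EP_ITEM_COST
	 * of a couple hundred bytes on 64-bit, allowing 1/25 (4%) of low
	 * memory works out to on the order of a couple hundred thousand
	 * watches per GiB of RAM.  The resulting value is visible and
	 * tunable via /proc/sys/fs/epoll/max_user_watches.
	 */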
2395
2396
2397
2398
2399
2400 ep_nested_calls_init(&poll_loop_ncalls);
2401
2402
2403
2404
2405
2406 BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
2407
2408
2409 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
2410 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
2411
2412
2413 pwq_cache = kmem_cache_create("eventpoll_pwq",
2414 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
2415
2416 return 0;
2417}
2418fs_initcall(eventpoll_init);
2419