1
2
3
4
5
6
7
8
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/sched/signal.h>
12#include <linux/fs.h>
13#include <linux/file.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/mm.h>
17#include <linux/slab.h>
18#include <linux/poll.h>
19#include <linux/string.h>
20#include <linux/list.h>
21#include <linux/hash.h>
22#include <linux/spinlock.h>
23#include <linux/syscalls.h>
24#include <linux/rbtree.h>
25#include <linux/wait.h>
26#include <linux/eventpoll.h>
27#include <linux/mount.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/anon_inodes.h>
31#include <linux/device.h>
32#include <linux/uaccess.h>
33#include <asm/io.h>
34#include <asm/mman.h>
35#include <linux/atomic.h>
36#include <linux/proc_fs.h>
37#include <linux/seq_file.h>
38#include <linux/compat.h>
39#include <linux/rculist.h>
40#include <net/busy_poll.h>
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

/* Event bits that may legitimately accompany EPOLLEXCLUSIVE */
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum recursion depth allowed by ep_call_nested() */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/* Sentinel marking an epitem that is NOT chained on ep->ovflist */
#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

/*
 * Key identifying a monitored file inside the eventpoll RB tree:
 * the target struct file plus the descriptor number it was added with.
 */
struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;
111
112
113
114
115
/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles.  One node per task currently inside a nested call chain.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;	/* identifies the call target, for loop detection */
	void *ctx;	/* identifies the calling context (e.g. current task) */
};

/*
 * This structure is used as collector for nested calls, to check for
 * maximum recursion and loops.  The spinlock is taken IRQ-safe in
 * ep_call_nested().
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};
130
131
132
133
134
135
136
/*
 * Each file descriptor added to the eventpoll interface will have an entry
 * of this type linked to the "rbr" RB tree.  Avoid increasing the size of
 * this struct: there can be many thousands of these on a busy server.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem (rbn is unused by then) */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items; EP_UNACTIVE_PTR when not chained.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues (struct eppoll_entry) */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};
175
176
177
178
179
180
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them.  It is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem"
	 * that got events while ready events were being transferred to
	 * userspace without holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used while ep_scan_ready_list() is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize the loop detection check */
	int visited;
	struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track the busy-poll NAPI ID */
	unsigned int napi_id;
#endif
};
229
230
/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head; ep_poll_callback() is its wake function.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by ep_send_events() as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
	int res;	/* output: number of events delivered, or -EFAULT */
};
260
261
262
263
264
/* Maximum number of epoll watches per user, settable via sysctl */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* Visited nodes during ep_loop_check(), so that we can unset them when done */
static LIST_HEAD(visited_list);

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths.  Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

/* sysctl table exposing fs.epoll.max_user_watches */
struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &long_zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;
312
313static inline int is_file_epoll(struct file *f)
314{
315 return f->f_op == &eventpoll_fops;
316}
317
318
319static inline void ep_set_ffd(struct epoll_filefd *ffd,
320 struct file *file, int fd)
321{
322 ffd->file = file;
323 ffd->fd = fd;
324}
325
326
327static inline int ep_cmp_ffd(struct epoll_filefd *p1,
328 struct epoll_filefd *p2)
329{
330 return (p1->file > p2->file ? +1:
331 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
332}
333
334
335static inline int ep_is_linked(struct epitem *epi)
336{
337 return !list_empty(&epi->rdllink);
338}
339
340static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
341{
342 return container_of(p, struct eppoll_entry, wait);
343}
344
345
346static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
347{
348 return container_of(p, struct eppoll_entry, wait)->base;
349}
350
351
352static inline struct epitem *ep_item_from_epqueue(poll_table *p)
353{
354 return container_of(p, struct ep_pqueue, pt)->epi;
355}
356
357
358static inline int ep_op_has_event(int op)
359{
360 return op != EPOLL_CTL_DEL;
361}
362
363
364static void ep_nested_calls_init(struct nested_calls *ncalls)
365{
366 INIT_LIST_HEAD(&ncalls->tasks_call_list);
367 spin_lock_init(&ncalls->lock);
368}
369
370
371
372
373
374
375
376
377
/*
 * Tell whether events are available to be returned to the caller: either
 * the ready list is non-empty, or the overflow list is currently in use
 * (i.e. a ready-list scan is in flight and may be collecting events).
 * Uses list_empty_careful()/READ_ONCE so it may be called locklessly.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty_careful(&ep->rdllist) ||
		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
383
384#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll termination predicate passed to napi_busy_loop(): stop when
 * events became available or the busy-poll time budget expired.
 */
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
	struct eventpoll *ep = p;

	return ep_events_available(ep) || busy_loop_timeout(start_time);
}
391
392
393
394
395
396
397
/*
 * Busy poll the last-seen NAPI queue if busy polling is globally enabled
 * and we have a valid NAPI ID recorded.  In the nonblock case no end
 * callback is passed, so napi_busy_loop() does a single pass.
 */
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
	unsigned int napi_id = READ_ONCE(ep->napi_id);

	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
}
405
406static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
407{
408 if (ep->napi_id)
409 ep->napi_id = 0;
410}
411
412
413
414
/*
 * Set epoll busy-poll NAPI ID from the socket behind @epi, if any.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
	struct eventpoll *ep;
	unsigned int napi_id;
	struct socket *sock;
	struct sock *sk;
	int err;

	if (!net_busy_loop_on())
		return;

	/* Only sockets carry a NAPI ID; anything else is ignored. */
	sock = sock_from_file(epi->ffd.file, &err);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);
	ep = epi->ep;

	/*
	 * Non-NAPI IDs can be rejected, and there is nothing to do if we
	 * already track this ID.
	 */
	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
		return;

	/* record NAPI ID for use in the next busy poll */
	ep->napi_id = napi_id;
}
447
448#else
449
/* Busy-poll stubs for kernels built without CONFIG_NET_RX_BUSY_POLL. */
static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
}

static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}
461
462#endif
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (by the meaning of same cookie)
 *                  is not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context (e.g. the current task).
 *
 * Returns: the code returned by the @nproc callback, or -1 if a loop was
 *          detected or the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
			/*
			 * Ops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552#ifdef CONFIG_DEBUG_LOCK_ALLOC
553
/*
 * With CONFIG_DEBUG_LOCK_ALLOC, nested epoll wakeups would trip lockdep
 * on the recursive wq->lock acquisition.  Track the per-CPU nesting depth
 * and use it as the lock subclass so lockdep sees each level as distinct.
 */
static DEFINE_PER_CPU(int, wakeup_nest);

static void ep_poll_safewake(wait_queue_head_t *wq)
{
	unsigned long flags;
	int subclass;

	/* IRQs off + preemption off keep the per-cpu nest counter stable */
	local_irq_save(flags);
	preempt_disable();
	subclass = __this_cpu_read(wakeup_nest);
	spin_lock_nested(&wq->lock, subclass + 1);
	__this_cpu_inc(wakeup_nest);
	wake_up_locked_poll(wq, POLLIN);
	__this_cpu_dec(wakeup_nest);
	spin_unlock(&wq->lock);
	local_irq_restore(flags);
	preempt_enable();
}
572
573#else
574
/* Without lockdep, a plain poll wakeup is sufficient. */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
	wake_up_poll(wq, EPOLLIN);
}
579
580#endif
581
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/*
	 * If it is cleared by POLLFREE, it should be rcu-safe.
	 * If we read NULL we need a barrier paired with
	 * smp_store_release() in ep_poll_callback(), otherwise
	 * we rely on whead->lock.
	 */
	whead = smp_load_acquire(&pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}
598
599
600
601
602
603
/*
 * This function unregisters poll callbacks from the associated file
 * descriptor, detaching and freeing every queued eppoll_entry.
 * NOTE(review): callers appear to serialize via ep->mtx or epmutex —
 * confirm against call sites.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while (!list_empty(lsthead)) {
		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

		list_del(&pwq->llink);
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}
617
618
/* Dereference epi->ws; lockdep-checked to require ep->mtx being held. */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}
623
624
625static inline void ep_pm_stay_awake(struct epitem *epi)
626{
627 struct wakeup_source *ws = ep_wakeup_source(epi);
628
629 if (ws)
630 __pm_stay_awake(ws);
631}
632
633static inline bool ep_has_wakeup_source(struct epitem *epi)
634{
635 return rcu_access_pointer(epi->ws) ? true : false;
636}
637
638
/*
 * RCU-protected variant of ep_pm_stay_awake(), for contexts where
 * ep->mtx cannot be held (the wait queue callback path).
 */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}
649
650
651
652
653
654
655
656
657
658
659
660
661
662
/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
 *                      for the scan callback to call f_op->poll(), and keeps
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same return value as the @sproc callback.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */
	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list.  Also, set ep->ovflist to NULL so that events
	 * happening while looping without locks are not lost: the poll
	 * callback will queue them on ->ovflist instead of ->rdllist.
	 */
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some other
	 * events might have been queued by the poll callback.  We re-insert
	 * them inside the main ready list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of
		 * them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in
			 * order to keep in FIFO.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);
	write_unlock_irq(&ep->lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	return res;
}
742
743static void epi_rcu_free(struct rcu_head *head)
744{
745 struct epitem *epi = container_of(head, struct epitem, rcu);
746 kmem_cache_free(epi_cache, epi);
747}
748
749
750
751
752
/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources.  Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	struct file *file = epi->ffd.file;

	lockdep_assert_irqs_enabled();

	/*
	 * Removes poll wait queue hooks.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item.  Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * "struct epitem" — the "rbn" field is no longer in use.  RCU readers
	 * (reverse_path_check_proc() walks fllink under rcu_read_lock) may
	 * still see the item until the grace period ends.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}
790
/* Tear down a whole eventpoll instance; called on last fput(). */
static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting for these files */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->mtx" here because the epoll file
	 * is on the way to be removed and no one has references to it
	 * anymore.  The only hit might come from eventpoll_release_file(),
	 * and holding "epmutex" is sufficient against that.
	 */
	mutex_lock(&epmutex);

	/*
	 * Walk through the whole tree unregistering poll callbacks.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Walk through the whole tree freeing each "struct epitem".  At this
	 * point no poll callbacks are lingering, and by holding "epmutex"
	 * no file cleanup code will hit us during this operation.  We take
	 * ep->mtx only to avoid a lockdep warning from ep_remove()'s
	 * assertions, not because anyone can contend it here.
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}
842
843static int ep_eventpoll_release(struct inode *inode, struct file *file)
844{
845 struct eventpoll *ep = file->private_data;
846
847 if (ep)
848 ep_free(ep);
849
850 return 0;
851}
852
/* Forward declarations — ep_item_poll() below needs both. */
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
				    void *priv);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt);
857
858
859
860
861
862
/*
 * Poll the target of an epitem and mask the result with the item's
 * interest set.  Differs from ep_eventpoll_poll() in that internal
 * callers already hold ep->mtx, so nesting must start from depth 1 for
 * correct mutex_lock_nested() annotation when the target is itself an
 * epoll file.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events;
	if (!is_file_epoll(epi->ffd.file))
		return vfs_poll(epi->ffd.file, pt) & epi->event.events;

	ep = epi->ffd.file->private_data;
	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
	/* queueing through our own proc means ep->mtx is already held */
	locked = pt && (pt->_qproc == ep_ptable_queue_proc);

	return ep_scan_ready_list(epi->ffd.file->private_data,
				  ep_read_events_proc, &depth, depth,
				  locked) & epi->event.events;
}
881
/*
 * Scan callback for polling a (possibly nested) epoll file: report
 * EPOLLIN as soon as one ready item still polls ready; drop items that
 * are no longer ready from the transfer list.
 */
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *epi, *tmp;
	poll_table pt;
	int depth = *(int *)priv;

	/* NULL _qproc: we only sample readiness, no wait queueing */
	init_poll_funcptr(&pt, NULL);
	depth++;

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (ep_item_poll(epi, &pt, depth)) {
			return EPOLLIN | EPOLLRDNORM;
		} else {
			/*
			 * Item has been dropped into the ready list by the
			 * poll callback, but it's not actually ready as far
			 * as the caller-requested events go.  We can remove
			 * it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}
908
/* f_op->poll for an epoll file itself (epoll-inside-epoll/select). */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;
	int depth = 0;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	return ep_scan_ready_list(ep, ep_read_events_proc,
				  &depth, depth, false);
}
924
925#ifdef CONFIG_PROC_FS
/* /proc/<pid>/fdinfo: one "tfd:" line per monitored descriptor. */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	/* ep->mtx keeps the RB tree stable while we walk it */
	mutex_lock(&ep->mtx);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
		struct inode *inode = file_inode(epi->ffd.file);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
			   " pos:%lli ino:%lx sdev:%x\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data,
			   (long long)epi->ffd.file->f_pos,
			   inode->i_ino, inode->i_sb->s_dev);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
947#endif
948
949
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};
958
959
960
961
962
963
/*
 * This is called from eventpoll_release() to unlink files from the
 * eventpoll interface.  We need this facility to correctly clean up
 * files that are closed without being removed from epoll first.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi, *next;

	/*
	 * We don't take "file->f_lock" here: we're in the "struct file"
	 * cleanup path, which means no one else is using this file anymore
	 * (epoll_ctl() can't reach here — fget() would already fail, the
	 * file count being zero).  The only concurrent hit might come from
	 * ep_free(), and holding "epmutex" serializes against that.
	 * "ep->mtx" is taken after "epmutex" because ep_remove() requires
	 * it when called from anywhere but ep_free().
	 */
	mutex_lock(&epmutex);
	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
		ep = epi->ep;
		mutex_lock_nested(&ep->mtx, 0);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}
	mutex_unlock(&epmutex);
}
991
/*
 * Allocate and initialize a new eventpoll context, charging it to the
 * current user.  On success *pep points to the new instance; on failure
 * -ENOMEM is returned and the user reference is dropped.
 */
static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}
1021
1022
1023
1024
1025
1026
/*
 * Search the (file, fd) pair inside the eventpoll RB tree.  The RB tree
 * operations are protected by the "mtx" mutex, so ep_find() must be
 * called with "mtx" held.  Returns the matching epitem or NULL.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}
1050
1051#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Checkpoint/restore helper: find the @toff-th epitem registered for the
 * target descriptor @tfd (a single fd may be registered multiple times
 * via dup'ed files).  NOTE(review): callers appear to hold ep->mtx —
 * confirm against get_epoll_tfile_raw_ptr().
 */
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
	struct rb_node *rbp;
	struct epitem *epi;

	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (epi->ffd.fd == tfd) {
			if (toff == 0)
				return epi;
			else
				toff--;
		}
		cond_resched();
	}

	return NULL;
}
1070
/*
 * Checkpoint/restore interface: resolve (tfd, toff) inside an epoll file
 * to the raw target struct file pointer.  Returns ERR_PTR(-EINVAL) if
 * @file is not an epoll file, ERR_PTR(-ENOENT) if no such item exists.
 */
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
				     unsigned long toff)
{
	struct file *file_raw;
	struct eventpoll *ep;
	struct epitem *epi;

	if (!is_file_epoll(file))
		return ERR_PTR(-EINVAL);

	ep = file->private_data;

	mutex_lock(&ep->mtx);
	epi = ep_find_tfd(ep, tfd, toff);
	if (epi)
		file_raw = epi->ffd.file;
	else
		file_raw = ERR_PTR(-ENOENT);
	mutex_unlock(&ep->mtx);

	return file_raw;
}
1093#endif
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
/*
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * Beware: it is necessary to prevent any other modifications of the
 * existing list until all changes are completed; in other words,
 * concurrent list_add_tail_lockless() calls should be protected with a
 * read lock, where the write lock acts as a barrier which makes sure all
 * list_add_tail_lockless() calls are fully completed.
 *
 * Also, an element can be locklessly added only in one direction —
 * either to the tail or to the head — otherwise concurrent access will
 * corrupt the list.
 *
 * Returns %false if the element has already been added to the list,
 * %true otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
					  struct list_head *head)
{
	struct list_head *prev;

	/*
	 * This is a simple 'new->next = head' operation, but cmpxchg() is
	 * used in order to detect that the same element has been just added
	 * to the list from another CPU: only the winner observes
	 * new->next == new.
	 */
	if (cmpxchg(&new->next, new, head) != new)
		return false;

	/*
	 * Initially 'new->next' == 'new', which means that only one CPU
	 * will update 'head->prev': the xchg() serializes tail claimers.
	 */
	prev = xchg(&head->prev, new);

	/*
	 * It is safe to modify prev->next and new->prev, because a new
	 * element is added only to the tail, and new->next was updated
	 * before prev->next.
	 */
	prev->next = new;
	new->prev = prev;

	return true;
}
1147
1148
1149
1150
1151
1152
1153
/*
 * Chains a new epi entry to the head of ep->ovflist in a lockless way,
 * i.e. multiple CPUs are allowed to call this function concurrently.
 *
 * Returns %false if the epi element has already been chained, %true
 * otherwise.
 */
static inline bool chain_epi_lockless(struct epitem *epi)
{
	struct eventpoll *ep = epi->ep;

	/* Check that the same epi has not been just chained from another CPU */
	if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
		return false;

	/* Atomically exchange the list head */
	epi->next = xchg(&ep->ovflist, epi);

	return true;
}
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
/*
 * This is the callback that is passed to the wait queue wakeup mechanism.
 * It is called by the stored file descriptors when they have events to
 * report.
 *
 * It takes ep->lock as a *read* lock so it does not contend with
 * concurrent callbacks for other descriptors; all modifications to
 * ->rdllist or ->ovflist are therefore lockless.  The read lock pairs
 * with the write lock in ep_scan_ready_list(), which stops all list
 * modifications and guarantees the list state is seen consistently.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	read_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/*
	 * If the event mask does not contain any poll(2) event, we consider
	 * the descriptor to be disabled.  This condition is likely the
	 * effect of the EPOLLONESHOT bit that disables the descriptor when
	 * an event is received, until the next EPOLL_CTL_MOD is issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback.  At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback.  We need to be able to handle both cases here, hence
	 * the test for "key" != NULL before the event match test.
	 */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux
	 * f_op->poll() semantics).  All the events that happen during that
	 * period of time are chained in ep->ovflist and requeued later on.
	 */
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		if (epi->next == EP_UNACTIVE_PTR &&
		    chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(epi) &&
	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up (if active) both the eventpoll wait list and the
	 * ->poll() wait list.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			/* for EPOLLEXCLUSIVE, only wake for matching IN/OUT */
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq);
	}
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	read_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	/* non-exclusive items always report the wakeup as consumed */
	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if (pollflags & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->entry);
		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(); ep_remove_wait_queue() takes whead->lock
		 * held by the caller.  Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}
1290
1291
1292
1293
1294
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}
1316
/*
 * Insert @epi into ep->rbr ordered by ep_cmp_ffd(), keeping the cached
 * leftmost node up to date.
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
	struct epitem *epic;
	bool leftmost = true;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0) {
			p = &parent->rb_right;
			/* went right at least once: cannot be leftmost */
			leftmost = false;
		} else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}
1337
1338
1339
#define PATH_ARR_SIZE 5
/*
 * These are the number of paths of length 1 to 5 that we are allowing to
 * emanate from a single file of interest.  For example, we allow 1000
 * paths of length 1 to emanate from each file of interest.  This
 * represents the potential wakeup paths, which need to be limited in
 * order to avoid massive uncontrolled wakeup storms.  The common case
 * should be a single path, so these limits are not expected to matter
 * in practice.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];
1354
1355static int path_count_inc(int nests)
1356{
1357
1358 if (nests == 0)
1359 return 0;
1360
1361 if (++path_count[nests] > path_limits[nests])
1362 return -1;
1363 return 0;
1364}
1365
1366static void path_count_init(void)
1367{
1368 int i;
1369
1370 for (i = 0; i < PATH_ARR_SIZE; i++)
1371 path_count[i] = 0;
1372}
1373
/*
 * Nested-call callback for reverse_path_check(): walk the epoll files
 * that directly monitor @priv and count the wakeup paths per nesting
 * depth, recursing through epoll files that are themselves monitored.
 * Returns -1 when a per-depth limit is exceeded.
 */
static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct file *child_file;
	struct epitem *epi;

	/* CTL_DEL can remove links here, but that can't increase our count */
	rcu_read_lock();
	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
		child_file = epi->ep->file;
		if (is_file_epoll(child_file)) {
			if (list_empty(&child_file->f_ep_links)) {
				if (path_count_inc(call_nests)) {
					error = -1;
					break;
				}
			} else {
				error = ep_call_nested(&poll_loop_ncalls,
							reverse_path_check_proc,
							child_file, child_file,
							current);
			}
			if (error != 0)
				break;
		} else {
			/* every ep->file should be an epoll file */
			printk(KERN_ERR "reverse_path_check_proc: "
				"file is not an ep!\n");
		}
	}
	rcu_read_unlock();
	return error;
}
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
/**
 * reverse_path_check - The tfile_check_list is a list of files with links
 *                      that are proposed to be newly added.  We need to
 *                      make sure that those added links don't add too many
 *                      wakeup paths, so we'd never spend all our time
 *                      waking up eventpoll objects.
 *
 * Returns: zero if the proposed links don't create too many paths,
 *	    -1 otherwise.
 */
static int reverse_path_check(void)
{
	int error = 0;
	struct file *current_file;

	/* let's call this for all tfiles */
	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
		path_count_init();
		error = ep_call_nested(&poll_loop_ncalls,
					reverse_path_check_proc, current_file,
					current_file, current);
		if (error)
			break;
	}
	return error;
}
1434
/*
 * Register a wakeup source for @epi (EPOLLWAKEUP), lazily creating the
 * per-instance "eventpoll" source on first use.  Returns -ENOMEM if
 * either registration fails.
 */
static int ep_create_wakeup_source(struct epitem *epi)
{
	const char *name;
	struct wakeup_source *ws;

	if (!epi->ep->ws) {
		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
		if (!epi->ep->ws)
			return -ENOMEM;
	}

	/* per-item source is named after the monitored file */
	name = epi->ffd.file->f_path.dentry->d_name.name;
	ws = wakeup_source_register(NULL, name);

	if (!ws)
		return -ENOMEM;
	rcu_assign_pointer(epi->ws, ws);

	return 0;
}
1455
1456
/* rare code path: only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * Wait for ep_pm_stay_awake_rcu() readers to finish before the
	 * source is torn down.  We cannot use call_rcu() here because
	 * wakeup_source_unregister() itself needs process context.
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}
1471
1472
1473
1474
/*
 * Add a new monitored item to the eventpoll interface.
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	lockdep_assert_irqs_enabled();

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follows here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function.  Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait
	 * queue install process, i.e. a wait queue allocation failed under
	 * high memory pressure (nwait is set to -1 in that case).
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hooks for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree.  All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* now check if we've created too many backpaths */
	error = -EINVAL;
	if (full_check && reverse_path_check())
		goto error_remove_epi;

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irq(&ep->lock);

	/* record NAPI ID of the new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_remove_epi:
	spin_lock(&tfile->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&tfile->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have arrived on some
	 * allocated wait queue.  Note that we don't care about the
	 * ep->ovflist list, since that is used/cleaned only inside a
	 * section bound by "mtx", and ep_insert() is called with "mtx"
	 * held.
	 */
	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
	kmem_cache_free(epi_cache, epi);

	return error;
}
1603
1604
1605
1606
1607
/*
 * Modify the interest event mask of an already-linked item.
 * Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
		     const struct epoll_event *event)
{
	int pwake = 0;
	poll_table pt;

	lockdep_assert_irqs_enabled();

	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data;
	if (epi->event.events & EPOLLWAKEUP) {
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * The following barrier has two effects:
	 *
	 * 1) It flushes the epi changes above to other CPUs, so we do not
	 *    miss events from ep_poll_callback() if an event occurs
	 *    immediately after we call f_op->poll().  We need this because
	 *    we did not take ep->lock while changing epi above (but
	 *    ep_poll_callback() does take ep->lock).
	 *
	 * 2) It also ensures we do not miss _past_ events when calling
	 *    f_op->poll(), pairing with the barrier in wq_has_sleeper()
	 *    on the waker side.
	 *
	 * This barrier guarantees that either ep_poll_callback() or
	 * f_op->poll() (or both) will notice the readiness of the item.
	 */
	smp_mb();

	/*
	 * Get current event bits.  We can safely use the file* here
	 * because its usage count has been increased by the caller.
	 * If the item is "hot" and not registered inside the ready list,
	 * push it inside.
	 */
	if (ep_item_poll(epi, &pt, 1)) {
		write_lock_irq(&ep->lock);
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		write_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}
1679
/*
 * Scan callback that copies ready events to the userspace buffer
 * described by the ep_send_events_data in @priv.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			      void *priv)
{
	struct ep_send_events_data *esed = priv;
	__poll_t revents;
	struct epitem *epi, *tmp;
	struct epoll_event __user *uevent = esed->events;
	struct wakeup_source *ws;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);
	esed->res = 0;

	/*
	 * We can loop without a lock because we are passed a task-private
	 * list.  Items cannot vanish during the loop because
	 * ep_scan_ready_list() is holding "mtx" during this call.
	 */
	lockdep_assert_held(&ep->mtx);

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (esed->res >= esed->maxevents)
			break;

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate
		 * epi->ws below).
		 *
		 * This could be rearranged to delay the deactivation of
		 * epi->ws instead, but then epi->ws would temporarily be
		 * out of sync with ep_is_linked().
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace.  Again,
		 * ep_scan_ready_list() is holding ep->mtx, so no operations
		 * coming from userspace can change the item.
		 */
		revents = ep_item_poll(epi, &pt, 1);
		if (!revents)
			continue;

		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			/* copy failed: put the item back for a later retry */
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}
		esed->res++;
		uevent++;
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			/*
			 * If this file has been added with Level Trigger
			 * mode, we need to insert it back inside the ready
			 * list, so that the next call to epoll_wait() will
			 * check the event availability again.  At this
			 * point no one can insert into ep->rdllist besides
			 * us: epoll_ctl() callers are locked out by
			 * ep_scan_ready_list() holding "mtx", and the poll
			 * callback queues into ep->ovflist.
			 */
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}

	return 0;
}
1763
1764static int ep_send_events(struct eventpoll *ep,
1765 struct epoll_event __user *events, int maxevents)
1766{
1767 struct ep_send_events_data esed;
1768
1769 esed.maxevents = maxevents;
1770 esed.events = events;
1771
1772 ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
1773 return esed.res;
1774}
1775
1776static inline struct timespec64 ep_set_mstimeout(long ms)
1777{
1778 struct timespec64 now, ts = {
1779 .tv_sec = ms / MSEC_PER_SEC,
1780 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1781 };
1782
1783 ktime_get_ts64(&now);
1784 return timespec64_add_safe(now, ts);
1785}
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds.  If @timeout is zero, the function will not block,
 *           while if @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: the number of ready events which have been fetched, or an
 *          error code in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	bool waiter = false;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non-blocking operation.  We still need
		 * the lock because we could race and not see an epi being
		 * added to the ready list while in the irq callback, thus
		 * incorrectly returning 0 back to userspace.
		 */
		timed_out = 1;

		write_lock_irq(&ep->lock);
		eavail = ep_events_available(ep);
		write_unlock_irq(&ep->lock);

		goto send_events;
	}

fetch_events:

	if (!ep_events_available(ep))
		ep_busy_loop(ep, timed_out);

	eavail = ep_events_available(ep);
	if (eavail)
		goto send_events;

	/*
	 * Busy poll timed out.  Drop the NAPI ID for now; we can add it
	 * back in when we have moved a socket with a valid NAPI ID onto
	 * the ready list.
	 */
	ep_reset_busy_poll_napi_id(ep);

	/*
	 * We don't have any available event to return to the caller.  We
	 * need to sleep here, and we will be woken by ep_poll_callback()
	 * when events become available.  The wait entry is added only once
	 * across fetch_events iterations.
	 */
	if (!waiter) {
		waiter = true;
		init_waitqueue_entry(&wait, current);

		spin_lock_irq(&ep->wq.lock);
		__add_wait_queue_exclusive(&ep->wq, &wait);
		spin_unlock_irq(&ep->wq.lock);
	}

	for (;;) {
		/*
		 * We don't want to sleep if ep_poll_callback() sends us a
		 * wakeup in between.  That's why we set the task state to
		 * TASK_INTERRUPTIBLE before doing the checks.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		/*
		 * Always short-circuit for fatal signals to allow threads
		 * to make a timely exit without the chance of finding more
		 * events available and fetching repeatedly.
		 */
		if (fatal_signal_pending(current)) {
			res = -EINTR;
			break;
		}

		eavail = ep_events_available(ep);
		if (eavail)
			break;
		if (signal_pending(current)) {
			res = -EINTR;
			break;
		}

		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1;
			break;
		}
	}

	__set_current_state(TASK_RUNNING);

send_events:
	/*
	 * Try to transfer events to user space.  In case we get 0 events
	 * and there's still timeout left over, we go trying again in
	 * search of more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	if (waiter) {
		spin_lock_irq(&ep->wq.lock);
		__remove_wait_queue(&ep->wq, &wait);
		spin_unlock_irq(&ep->wq.lock);
	}

	return res;
}
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
/**
 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
 *                      API, to verify that adding an epoll file inside another
 *                      epoll structure does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @priv: Pointer to the epoll file to be currently checked.
 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
 *          file.
 * @call_nests: Current depth of the @ep_call_nested() call stack.
 *
 * Returns: zero if adding the epoll @file inside the current epoll
 *          structure @ep does not violate the constraints, non-zero otherwise.
 */
static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct eventpoll *ep = file->private_data;
	struct eventpoll *ep_tovisit;
	struct rb_node *rbp;
	struct epitem *epi;

	/* Nesting level encodes the lock class to keep lockdep happy */
	mutex_lock_nested(&ep->mtx, call_nests + 1);
	ep->visited = 1;
	list_add(&ep->visited_list_link, &visited_list);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			ep_tovisit = epi->ffd.file->private_data;
			/* Already seen in this scan: skip to avoid re-walking */
			if (ep_tovisit->visited)
				continue;
			/* Recurse (bounded by ep_call_nested) into the nested epoll */
			error = ep_call_nested(&poll_loop_ncalls,
					ep_loop_check_proc, epi->ffd.file,
					ep_tovisit, current);
			if (error != 0)
				break;
		} else {
			/*
			 * If we've reached a file that is not associated with
			 * an ep, then we need to check if the newly added
			 * links are going to add too many wakeup paths. We do
			 * this by adding it to the tfile_check_list, if it's
			 * not already there, and calling reverse_path_check()
			 * during ep_insert().
			 */
			if (list_empty(&epi->ffd.file->f_tfile_llink))
				list_add(&epi->ffd.file->f_tfile_llink,
					 &tfile_check_list);
		}
	}
	mutex_unlock(&ep->mtx);

	return error;
}
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989static int ep_loop_check(struct eventpoll *ep, struct file *file)
1990{
1991 int ret;
1992 struct eventpoll *ep_cur, *ep_next;
1993
1994 ret = ep_call_nested(&poll_loop_ncalls,
1995 ep_loop_check_proc, file, ep, current);
1996
1997 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1998 visited_list_link) {
1999 ep_cur->visited = 0;
2000 list_del(&ep_cur->visited_list_link);
2001 }
2002 return ret;
2003}
2004
2005static void clear_tfile_check_list(void)
2006{
2007 struct file *file;
2008
2009
2010 while (!list_empty(&tfile_check_list)) {
2011 file = list_first_entry(&tfile_check_list, struct file,
2012 f_tfile_llink);
2013 list_del_init(&file->f_tfile_llink);
2014 }
2015 INIT_LIST_HEAD(&tfile_check_list);
2016}
2017
2018
2019
2020
2021static int do_epoll_create(int flags)
2022{
2023 int error, fd;
2024 struct eventpoll *ep = NULL;
2025 struct file *file;
2026
2027
2028 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
2029
2030 if (flags & ~EPOLL_CLOEXEC)
2031 return -EINVAL;
2032
2033
2034
2035 error = ep_alloc(&ep);
2036 if (error < 0)
2037 return error;
2038
2039
2040
2041
2042 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
2043 if (fd < 0) {
2044 error = fd;
2045 goto out_free_ep;
2046 }
2047 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
2048 O_RDWR | (flags & O_CLOEXEC));
2049 if (IS_ERR(file)) {
2050 error = PTR_ERR(file);
2051 goto out_free_fd;
2052 }
2053 ep->file = file;
2054 fd_install(fd, file);
2055 return fd;
2056
2057out_free_fd:
2058 put_unused_fd(fd);
2059out_free_ep:
2060 ep_free(ep);
2061 return error;
2062}
2063
/* epoll_create1(2): create an epoll instance; flags may include EPOLL_CLOEXEC */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}
2068
2069SYSCALL_DEFINE1(epoll_create, int, size)
2070{
2071 if (size <= 0)
2072 return -EINVAL;
2073
2074 return do_epoll_create(0);
2075}
2076
2077
2078
2079
2080
2081
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	/* DEL takes no event argument; ADD/MOD copy it in from userspace */
	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed (CAP_BLOCK_SUSPEND) */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file
	 * descriptor the user passed to us _is_ an eventpoll file. And also
	 * we do not permit adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking
	 * the 'epmutex' on add is to prevent complex topologies such as loops
	 * and deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	mutex_lock_nested(&ep->mtx, 0);
	if (op == EPOLL_CTL_ADD) {
		if (!list_empty(&f.file->f_ep_links) ||
						is_file_epoll(tf.file)) {
			/* Nested epoll: do the full loop/path check under epmutex */
			full_check = 1;
			mutex_unlock(&ep->mtx);
			mutex_lock(&epmutex);
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0) {
					clear_tfile_check_list();
					goto error_tgt_fput;
				}
			} else
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			mutex_lock_nested(&ep->mtx, 0);
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				mutex_lock_nested(&tep->mtx, 1);
			}
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* ERR/HUP are always reported, even if not requested */
			epds.events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			/* MOD on an EPOLLEXCLUSIVE entry is silently refused */
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds.events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, &epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check)
		mutex_unlock(&epmutex);

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}
2233
2234
2235
2236
2237
2238static int do_epoll_wait(int epfd, struct epoll_event __user *events,
2239 int maxevents, int timeout)
2240{
2241 int error;
2242 struct fd f;
2243 struct eventpoll *ep;
2244
2245
2246 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
2247 return -EINVAL;
2248
2249
2250 if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
2251 return -EFAULT;
2252
2253
2254 f = fdget(epfd);
2255 if (!f.file)
2256 return -EBADF;
2257
2258
2259
2260
2261
2262 error = -EINVAL;
2263 if (!is_file_epoll(f.file))
2264 goto error_fput;
2265
2266
2267
2268
2269
2270 ep = f.file->private_data;
2271
2272
2273 error = ep_poll(ep, events, maxevents, timeout);
2274
2275error_fput:
2276 fdput(f);
2277 return error;
2278}
2279
/* epoll_wait(2): wait for events on an epoll instance */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}
2285
2286
2287
2288
2289
2290SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
2291 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
2292 size_t, sigsetsize)
2293{
2294 int error;
2295
2296
2297
2298
2299
2300 error = set_user_sigmask(sigmask, sigsetsize);
2301 if (error)
2302 return error;
2303
2304 error = do_epoll_wait(epfd, events, maxevents, timeout);
2305 restore_saved_sigmask_unless(error == -EINTR);
2306
2307 return error;
2308}
2309
2310#ifdef CONFIG_COMPAT
2311COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2312 struct epoll_event __user *, events,
2313 int, maxevents, int, timeout,
2314 const compat_sigset_t __user *, sigmask,
2315 compat_size_t, sigsetsize)
2316{
2317 long err;
2318
2319
2320
2321
2322
2323 err = set_compat_user_sigmask(sigmask, sigsetsize);
2324 if (err)
2325 return err;
2326
2327 err = do_epoll_wait(epfd, events, maxevents, timeout);
2328 restore_saved_sigmask_unless(err == -EINTR);
2329
2330 return err;
2331}
2332#endif
2333
/*
 * eventpoll_init - boot-time initialization: sizes the per-user watch
 * limit from available low memory, sets up the nested-call tracker used
 * for loop checking, and creates the slab caches for epitems and poll
 * queue entries.
 */
static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
2369