/*
 * fs/eventpoll.c (Efficient event retrieval implementation)
 *
 * Copyright (C) Davide Libenzi <davidel@xmailserver.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->wq.lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 *
 * "epmutex" is taken when an epoll fd is inserted into another epoll fd and
 * in eventpoll_release_file(), to serialize loop and wakeup-path checks
 * across all epoll instances.  "ep->mtx" protects the RB tree of watched
 * fds and is held while calling the target file's ->poll() and while
 * copying events to user space.  "ep->wq.lock" protects the ready list
 * (rdllist) and the overflow list (ovflist); it is the only lock taken
 * from the wakeup callback, which may run in IRQ context, hence the
 * irqsave variants used there.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to use another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. It is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->wq.lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	int visited;
	struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
	int res;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* Visited nodes during ep_loop_check(), so that we can unset them later */
static LIST_HEAD(visited_list);

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long zero;
static long long_max = LONG_MAX;

struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
		(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct epitem *epi)
{
	return !list_empty(&epi->rdllink);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
	return op != EPOLL_CTL_DEL;
}

/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Returns: Returns a value different than zero if ready events are available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
	struct eventpoll *ep = p;

	return ep_events_available(ep) || busy_loop_timeout(start_time);
}

/*
 * Busy poll if globally on and supporting sockets found && no events,
 * busy loop will return if need_resched or ep_events_available.
 *
 * we must do our busy polling with irqs enabled
 */
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
	unsigned int napi_id = READ_ONCE(ep->napi_id);

	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
	if (ep->napi_id)
		ep->napi_id = 0;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
	struct eventpoll *ep;
	unsigned int napi_id;
	struct socket *sock;
	struct sock *sk;
	int err;

	if (!net_busy_loop_on())
		return;

	sock = sock_from_file(epi->ffd.file, &err);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);
	ep = epi->ep;

	/*
	 * Non-NAPI IDs can be rejected, and there is nothing to do if we
	 * already have this ID recorded.
	 */
	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
		return;

	/* record NAPI ID for use in next busy poll */
	ep->napi_id = napi_id;
}

#else

static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
}

static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (by the meaning of same cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @max_nests: Maximum number of allowed nesting calls.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
483static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
484 int (*nproc)(void *, void *, int), void *priv,
485 void *cookie, void *ctx)
486{
487 int error, call_nests = 0;
488 unsigned long flags;
489 struct list_head *lsthead = &ncalls->tasks_call_list;
490 struct nested_call_node *tncur;
491 struct nested_call_node tnode;
492
493 spin_lock_irqsave(&ncalls->lock, flags);
494
495
496
497
498
499
500 list_for_each_entry(tncur, lsthead, llink) {
501 if (tncur->ctx == ctx &&
502 (tncur->cookie == cookie || ++call_nests > max_nests)) {
503
504
505
506
507 error = -1;
508 goto out_unlock;
509 }
510 }
511
512
513 tnode.ctx = ctx;
514 tnode.cookie = cookie;
515 list_add(&tnode.llink, lsthead);
516
517 spin_unlock_irqrestore(&ncalls->lock, flags);
518
519
520 error = (*nproc)(priv, cookie, call_nests);
521
522
523 spin_lock_irqsave(&ncalls->lock, flags);
524 list_del(&tnode.llink);
525out_unlock:
526 spin_unlock_irqrestore(&ncalls->lock, flags);
527
528 return error;
529}

/*
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, nested epoll wakeups must pass
 * a distinct lockdep subclass for each nesting level, so that the wait
 * queue lock acquisitions along the wakeup chain are not flagged as
 * recursive locking.  ep_poll_safewake() therefore routes the wakeup
 * through ep_call_nested(), which tracks the per-CPU nesting depth and
 * hands it to spin_lock_irqsave_nested().  Without lockdep, a plain
 * wake_up_poll() is sufficient.
 */
556#ifdef CONFIG_DEBUG_LOCK_ALLOC
557
558static struct nested_calls poll_safewake_ncalls;
559
560static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
561{
562 unsigned long flags;
563 wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
564
565 spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
566 wake_up_locked_poll(wqueue, EPOLLIN);
567 spin_unlock_irqrestore(&wqueue->lock, flags);
568
569 return 0;
570}
571
572static void ep_poll_safewake(wait_queue_head_t *wq)
573{
574 int this_cpu = get_cpu();
575
576 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
577 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
578
579 put_cpu();
580}
581
582#else
583
584static void ep_poll_safewake(wait_queue_head_t *wq)
585{
586 wake_up_poll(wq, EPOLLIN);
587}
588
589#endif
590
591static void ep_remove_wait_queue(struct eppoll_entry *pwq)
592{
593 wait_queue_head_t *whead;
594
595 rcu_read_lock();
596
597
598
599
600
601
602 whead = smp_load_acquire(&pwq->whead);
603 if (whead)
604 remove_wait_queue(whead, &pwq->wait);
605 rcu_read_unlock();
606}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
 * ep_free).
 */
613static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
614{
615 struct list_head *lsthead = &epi->pwqlist;
616 struct eppoll_entry *pwq;
617
618 while (!list_empty(lsthead)) {
619 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
620
621 list_del(&pwq->llink);
622 ep_remove_wait_queue(pwq);
623 kmem_cache_free(pwq_cache, pwq);
624 }
625}
626
627
628static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
629{
630 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
631}
632
633
634static inline void ep_pm_stay_awake(struct epitem *epi)
635{
636 struct wakeup_source *ws = ep_wakeup_source(epi);
637
638 if (ws)
639 __pm_stay_awake(ws);
640}
641
642static inline bool ep_has_wakeup_source(struct epitem *epi)
643{
644 return rcu_access_pointer(epi->ws) ? true : false;
645}
646
647
648static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
649{
650 struct wakeup_source *ws;
651
652 rcu_read_lock();
653 ws = rcu_dereference(epi->ws);
654 if (ws)
655 __pm_stay_awake(ws);
656 rcu_read_unlock();
657}

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
 *                      for the scan code to call f_op->poll(). Also allows
 *                      for O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same return value as the @sproc callback.
 */
672static __poll_t ep_scan_ready_list(struct eventpoll *ep,
673 __poll_t (*sproc)(struct eventpoll *,
674 struct list_head *, void *),
675 void *priv, int depth, bool ep_locked)
676{
677 __poll_t res;
678 int pwake = 0;
679 struct epitem *epi, *nepi;
680 LIST_HEAD(txlist);
681
682 lockdep_assert_irqs_enabled();
683
684
685
686
687
688
689 if (!ep_locked)
690 mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
700 spin_lock_irq(&ep->wq.lock);
701 list_splice_init(&ep->rdllist, &txlist);
702 ep->ovflist = NULL;
703 spin_unlock_irq(&ep->wq.lock);
704
705
706
707
708 res = (*sproc)(ep, &txlist, priv);
709
710 spin_lock_irq(&ep->wq.lock);

	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
716 for (nepi = ep->ovflist; (epi = nepi) != NULL;
717 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
718
719
720
721
722
723
724 if (!ep_is_linked(epi)) {
725 list_add_tail(&epi->rdllink, &ep->rdllist);
726 ep_pm_stay_awake(epi);
727 }
728 }
729
730
731
732
733
734 ep->ovflist = EP_UNACTIVE_PTR;
735
736
737
738
739 list_splice(&txlist, &ep->rdllist);
740 __pm_relax(ep->ws);
741
742 if (!list_empty(&ep->rdllist)) {
743
744
745
746
747 if (waitqueue_active(&ep->wq))
748 wake_up_locked(&ep->wq);
749 if (waitqueue_active(&ep->poll_wait))
750 pwake++;
751 }
752 spin_unlock_irq(&ep->wq.lock);
753
754 if (!ep_locked)
755 mutex_unlock(&ep->mtx);
756
757
758 if (pwake)
759 ep_poll_safewake(&ep->poll_wait);
760
761 return res;
762}
763
764static void epi_rcu_free(struct rcu_head *head)
765{
766 struct epitem *epi = container_of(head, struct epitem, rcu);
767 kmem_cache_free(epi_cache, epi);
768}

/*
 * Removes a "struct epitem" from the eventpoll RB tree, unlinks it from the
 * file's epoll list and from the ready list, and frees it via RCU.
 * Must be called with "mtx" held.
 */
774static int ep_remove(struct eventpoll *ep, struct epitem *epi)
775{
776 struct file *file = epi->ffd.file;
777
778 lockdep_assert_irqs_enabled();
779
780
781
782
783 ep_unregister_pollwait(ep, epi);
784
785
786 spin_lock(&file->f_lock);
787 list_del_rcu(&epi->fllink);
788 spin_unlock(&file->f_lock);
789
790 rb_erase_cached(&epi->rbn, &ep->rbr);
791
792 spin_lock_irq(&ep->wq.lock);
793 if (ep_is_linked(epi))
794 list_del_init(&epi->rdllink);
795 spin_unlock_irq(&ep->wq.lock);
796
797 wakeup_source_unregister(ep_wakeup_source(epi));
798
799
800
801
802
803
804
805 call_rcu(&epi->rcu, epi_rcu_free);
806
807 atomic_long_dec(&ep->user->epoll_watches);
808
809 return 0;
810}
811
812static void ep_free(struct eventpoll *ep)
813{
814 struct rb_node *rbp;
815 struct epitem *epi;
816
817
818 if (waitqueue_active(&ep->poll_wait))
819 ep_poll_safewake(&ep->poll_wait);
820
821
822
823
824
825
826
827
828
829 mutex_lock(&epmutex);
830
831
832
833
834 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
835 epi = rb_entry(rbp, struct epitem, rbn);
836
837 ep_unregister_pollwait(ep, epi);
838 cond_resched();
839 }
840
841
842
843
844
845
846
847
848
849 mutex_lock(&ep->mtx);
850 while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
851 epi = rb_entry(rbp, struct epitem, rbn);
852 ep_remove(ep, epi);
853 cond_resched();
854 }
855 mutex_unlock(&ep->mtx);
856
857 mutex_unlock(&epmutex);
858 mutex_destroy(&ep->mtx);
859 free_uid(ep->user);
860 wakeup_source_unregister(ep->ws);
861 kfree(ep);
862}
863
864static int ep_eventpoll_release(struct inode *inode, struct file *file)
865{
866 struct eventpoll *ep = file->private_data;
867
868 if (ep)
869 ep_free(ep);
870
871 return 0;
872}
873
874static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
875 void *priv);
876static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
877 poll_table *pt);

/*
 * Differs from ep_eventpoll_poll() in that internal callers already hold
 * the ep->mtx, so we start from depth = 1 so that mutex_lock_nested() in
 * ep_scan_ready_list() gets the correct lockdep subclass.
 */
884static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
885 int depth)
886{
887 struct eventpoll *ep;
888 bool locked;
889
890 pt->_key = epi->event.events;
891 if (!is_file_epoll(epi->ffd.file))
892 return vfs_poll(epi->ffd.file, pt) & epi->event.events;
893
894 ep = epi->ffd.file->private_data;
895 poll_wait(epi->ffd.file, &ep->poll_wait, pt);
896 locked = pt && (pt->_qproc == ep_ptable_queue_proc);
897
898 return ep_scan_ready_list(epi->ffd.file->private_data,
899 ep_read_events_proc, &depth, depth,
900 locked) & epi->event.events;
901}
902
903static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
904 void *priv)
905{
906 struct epitem *epi, *tmp;
907 poll_table pt;
908 int depth = *(int *)priv;
909
910 init_poll_funcptr(&pt, NULL);
911 depth++;
912
913 list_for_each_entry_safe(epi, tmp, head, rdllink) {
914 if (ep_item_poll(epi, &pt, depth)) {
915 return EPOLLIN | EPOLLRDNORM;
916 } else {
917
918
919
920
921
922 __pm_relax(ep_wakeup_source(epi));
923 list_del_init(&epi->rdllink);
924 }
925 }
926
927 return 0;
928}
929
930static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
931{
932 struct eventpoll *ep = file->private_data;
933 int depth = 0;
934
935
936 poll_wait(file, &ep->poll_wait, wait);
937
938
939
940
941
942 return ep_scan_ready_list(ep, ep_read_events_proc,
943 &depth, depth, false);
944}
945
946#ifdef CONFIG_PROC_FS
947static void ep_show_fdinfo(struct seq_file *m, struct file *f)
948{
949 struct eventpoll *ep = f->private_data;
950 struct rb_node *rbp;
951
952 mutex_lock(&ep->mtx);
953 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
954 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
955 struct inode *inode = file_inode(epi->ffd.file);
956
957 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
958 " pos:%lli ino:%lx sdev:%x\n",
959 epi->ffd.fd, epi->event.events,
960 (long long)epi->event.data,
961 (long long)epi->ffd.file->f_pos,
962 inode->i_ino, inode->i_sb->s_dev);
963 if (seq_has_overflowed(m))
964 break;
965 }
966 mutex_unlock(&ep->mtx);
967}
968#endif
969
970
971static const struct file_operations eventpoll_fops = {
972#ifdef CONFIG_PROC_FS
973 .show_fdinfo = ep_show_fdinfo,
974#endif
975 .release = ep_eventpoll_release,
976 .poll = ep_eventpoll_poll,
977 .llseek = noop_llseek,
978};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 */
985void eventpoll_release_file(struct file *file)
986{
987 struct eventpoll *ep;
988 struct epitem *epi, *next;

	/*
	 * We don't take "file->f_lock" here: we are in the "struct file"
	 * release path, so nobody else can reach this file's epoll links
	 * anymore.  "epmutex" serializes us against ep_free(), and we must
	 * take "ep->mtx" after it because ep_remove() requires it when not
	 * called from ep_free().
	 */
1003 mutex_lock(&epmutex);
1004 list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
1005 ep = epi->ep;
1006 mutex_lock_nested(&ep->mtx, 0);
1007 ep_remove(ep, epi);
1008 mutex_unlock(&ep->mtx);
1009 }
1010 mutex_unlock(&epmutex);
1011}
1012
1013static int ep_alloc(struct eventpoll **pep)
1014{
1015 int error;
1016 struct user_struct *user;
1017 struct eventpoll *ep;
1018
1019 user = get_current_user();
1020 error = -ENOMEM;
1021 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
1022 if (unlikely(!ep))
1023 goto free_uid;
1024
1025 mutex_init(&ep->mtx);
1026 init_waitqueue_head(&ep->wq);
1027 init_waitqueue_head(&ep->poll_wait);
1028 INIT_LIST_HEAD(&ep->rdllist);
1029 ep->rbr = RB_ROOT_CACHED;
1030 ep->ovflist = EP_UNACTIVE_PTR;
1031 ep->user = user;
1032
1033 *pep = ep;
1034
1035 return 0;
1036
1037free_uid:
1038 free_uid(user);
1039 return error;
1040}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
1047static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
1048{
1049 int kcmp;
1050 struct rb_node *rbp;
1051 struct epitem *epi, *epir = NULL;
1052 struct epoll_filefd ffd;
1053
1054 ep_set_ffd(&ffd, file, fd);
1055 for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1056 epi = rb_entry(rbp, struct epitem, rbn);
1057 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
1058 if (kcmp > 0)
1059 rbp = rbp->rb_right;
1060 else if (kcmp < 0)
1061 rbp = rbp->rb_left;
1062 else {
1063 epir = epi;
1064 break;
1065 }
1066 }
1067
1068 return epir;
1069}
1070
1071#ifdef CONFIG_CHECKPOINT_RESTORE
1072static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
1073{
1074 struct rb_node *rbp;
1075 struct epitem *epi;
1076
1077 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1078 epi = rb_entry(rbp, struct epitem, rbn);
1079 if (epi->ffd.fd == tfd) {
1080 if (toff == 0)
1081 return epi;
1082 else
1083 toff--;
1084 }
1085 cond_resched();
1086 }
1087
1088 return NULL;
1089}
1090
1091struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
1092 unsigned long toff)
1093{
1094 struct file *file_raw;
1095 struct eventpoll *ep;
1096 struct epitem *epi;
1097
1098 if (!is_file_epoll(file))
1099 return ERR_PTR(-EINVAL);
1100
1101 ep = file->private_data;
1102
1103 mutex_lock(&ep->mtx);
1104 epi = ep_find_tfd(ep, tfd, toff);
1105 if (epi)
1106 file_raw = epi->ffd.file;
1107 else
1108 file_raw = ERR_PTR(-ENOENT);
1109 mutex_unlock(&ep->mtx);
1110
1111 return file_raw;
1112}
1113#endif

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
1120static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
1121{
1122 int pwake = 0;
1123 unsigned long flags;
1124 struct epitem *epi = ep_item_from_wait(wait);
1125 struct eventpoll *ep = epi->ep;
1126 __poll_t pollflags = key_to_poll(key);
1127 int ewake = 0;
1128
1129 spin_lock_irqsave(&ep->wq.lock, flags);
1130
1131 ep_set_busy_poll_napi_id(epi);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is
	 * received, until the next EPOLL_CTL_MOD will be issued.
	 */
1139 if (!(epi->event.events & ~EP_PRIVATE_BITS))
1140 goto out_unlock;
1141
1142
1143
1144
1145
1146
1147
1148 if (pollflags && !(pollflags & epi->event.events))
1149 goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
1157 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
1158 if (epi->next == EP_UNACTIVE_PTR) {
1159 epi->next = ep->ovflist;
1160 ep->ovflist = epi;
1161 if (epi->ws) {
1162
1163
1164
1165
1166 __pm_stay_awake(ep->ws);
1167 }
1168
1169 }
1170 goto out_unlock;
1171 }
1172
1173
1174 if (!ep_is_linked(epi)) {
1175 list_add_tail(&epi->rdllink, &ep->rdllist);
1176 ep_pm_stay_awake_rcu(epi);
1177 }
1178
1179
1180
1181
1182
1183 if (waitqueue_active(&ep->wq)) {
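		/*
		 * In exclusive wake mode, only report the wakeup as consumed
		 * (ewake = 1) when the signalled events overlap the mask this
		 * epitem is interested in; otherwise other exclusive waiters
		 * still need to be considered for wakeup.
		 */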
1184 if ((epi->event.events & EPOLLEXCLUSIVE) &&
1185 !(pollflags & POLLFREE)) {
1186 switch (pollflags & EPOLLINOUT_BITS) {
1187 case EPOLLIN:
1188 if (epi->event.events & EPOLLIN)
1189 ewake = 1;
1190 break;
1191 case EPOLLOUT:
1192 if (epi->event.events & EPOLLOUT)
1193 ewake = 1;
1194 break;
1195 case 0:
1196 ewake = 1;
1197 break;
1198 }
1199 }
1200 wake_up_locked(&ep->wq);
1201 }
1202 if (waitqueue_active(&ep->poll_wait))
1203 pwake++;
1204
1205out_unlock:
1206 spin_unlock_irqrestore(&ep->wq.lock, flags);
1207
1208
1209 if (pwake)
1210 ep_poll_safewake(&ep->poll_wait);
1211
1212 if (!(epi->event.events & EPOLLEXCLUSIVE))
1213 ewake = 1;
1214
1215 if (pollflags & POLLFREE) {

		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
1221 list_del_init(&wait->entry);

		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(); ep_remove_wait_queue() takes whead->lock
		 * held by the caller. Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
1228 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
1229 }
1230
1231 return ewake;
1232}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
1238static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1239 poll_table *pt)
1240{
1241 struct epitem *epi = ep_item_from_epqueue(pt);
1242 struct eppoll_entry *pwq;
1243
1244 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1245 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1246 pwq->whead = whead;
1247 pwq->base = epi;
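		/*
		 * EPOLLEXCLUSIVE items are queued as exclusive waiters, so the
		 * wait queue wakes only one of them per event; otherwise queue
		 * a normal (wake-all) entry.
		 */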
1248 if (epi->event.events & EPOLLEXCLUSIVE)
1249 add_wait_queue_exclusive(whead, &pwq->wait);
1250 else
1251 add_wait_queue(whead, &pwq->wait);
1252 list_add_tail(&pwq->llink, &epi->pwqlist);
1253 epi->nwait++;
1254 } else {
1255
1256 epi->nwait = -1;
1257 }
1258}
1259
1260static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1261{
1262 int kcmp;
1263 struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1264 struct epitem *epic;
1265 bool leftmost = true;
1266
1267 while (*p) {
1268 parent = *p;
1269 epic = rb_entry(parent, struct epitem, rbn);
1270 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1271 if (kcmp > 0) {
1272 p = &parent->rb_right;
1273 leftmost = false;
1274 } else
1275 p = &parent->rb_left;
1276 }
1277 rb_link_node(&epi->rbn, parent, p);
1278 rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1279}
1280
1281
1282
1283#define PATH_ARR_SIZE 5

/*
 * path_limits[] caps how many paths of a given nesting depth may lead from
 * epoll sets back to a single target file (1000, 500, 100, 50 and 10 for
 * increasing depth).  Together with reverse_path_check() below, this bounds
 * the wakeup work a single file can trigger through nested epoll sets.
 */
1295static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1296static int path_count[PATH_ARR_SIZE];
1297
1298static int path_count_inc(int nests)
1299{
1300
1301 if (nests == 0)
1302 return 0;
1303
1304 if (++path_count[nests] > path_limits[nests])
1305 return -1;
1306 return 0;
1307}
1308
1309static void path_count_init(void)
1310{
1311 int i;
1312
1313 for (i = 0; i < PATH_ARR_SIZE; i++)
1314 path_count[i] = 0;
1315}
1316
1317static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1318{
1319 int error = 0;
1320 struct file *file = priv;
1321 struct file *child_file;
1322 struct epitem *epi;
1323
1324
1325 rcu_read_lock();
1326 list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
1327 child_file = epi->ep->file;
1328 if (is_file_epoll(child_file)) {
1329 if (list_empty(&child_file->f_ep_links)) {
1330 if (path_count_inc(call_nests)) {
1331 error = -1;
1332 break;
1333 }
1334 } else {
1335 error = ep_call_nested(&poll_loop_ncalls,
1336 EP_MAX_NESTS,
1337 reverse_path_check_proc,
1338 child_file, child_file,
1339 current);
1340 }
1341 if (error != 0)
1342 break;
1343 } else {
1344 printk(KERN_ERR "reverse_path_check_proc: "
1345 "file is not an ep!\n");
1346 }
1347 }
1348 rcu_read_unlock();
1349 return error;
1350}

/**
 * reverse_path_check - The tfile_check_list is a list of files with links
 *                      that are proposed to be newly added. We need to make
 *                      sure that those added links don't add too many paths,
 *                      such that we would spend all our time waking up
 *                      eventpoll objects.
 *
 * Returns: Returns zero if the proposed links don't create too many paths,
 *          -1 otherwise.
 */
1362static int reverse_path_check(void)
1363{
1364 int error = 0;
1365 struct file *current_file;
1366
1367
1368 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1369 path_count_init();
1370 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1371 reverse_path_check_proc, current_file,
1372 current_file, current);
1373 if (error)
1374 break;
1375 }
1376 return error;
1377}
1378
1379static int ep_create_wakeup_source(struct epitem *epi)
1380{
1381 const char *name;
1382 struct wakeup_source *ws;
1383
1384 if (!epi->ep->ws) {
1385 epi->ep->ws = wakeup_source_register("eventpoll");
1386 if (!epi->ep->ws)
1387 return -ENOMEM;
1388 }
1389
1390 name = epi->ffd.file->f_path.dentry->d_name.name;
1391 ws = wakeup_source_register(name);
1392
1393 if (!ws)
1394 return -ENOMEM;
1395 rcu_assign_pointer(epi->ws, ws);
1396
1397 return 0;
1398}
1399
1400
1401static noinline void ep_destroy_wakeup_source(struct epitem *epi)
1402{
1403 struct wakeup_source *ws = ep_wakeup_source(epi);
1404
1405 RCU_INIT_POINTER(epi->ws, NULL);
1406
1407
1408
1409
1410
1411
1412 synchronize_rcu();
1413 wakeup_source_unregister(ws);
1414}

/*
 * Must be called with "mtx" held.
 */
1419static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
1420 struct file *tfile, int fd, int full_check)
1421{
1422 int error, pwake = 0;
1423 __poll_t revents;
1424 long user_watches;
1425 struct epitem *epi;
1426 struct ep_pqueue epq;
1427
1428 lockdep_assert_irqs_enabled();
1429
1430 user_watches = atomic_long_read(&ep->user->epoll_watches);
1431 if (unlikely(user_watches >= max_user_watches))
1432 return -ENOSPC;
1433 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1434 return -ENOMEM;
1435
1436
1437 INIT_LIST_HEAD(&epi->rdllink);
1438 INIT_LIST_HEAD(&epi->fllink);
1439 INIT_LIST_HEAD(&epi->pwqlist);
1440 epi->ep = ep;
1441 ep_set_ffd(&epi->ffd, tfile, fd);
1442 epi->event = *event;
1443 epi->nwait = 0;
1444 epi->next = EP_UNACTIVE_PTR;
1445 if (epi->event.events & EPOLLWAKEUP) {
1446 error = ep_create_wakeup_source(epi);
1447 if (error)
1448 goto error_create_wakeup_source;
1449 } else {
1450 RCU_INIT_POINTER(epi->ws, NULL);
1451 }
1452
1453
1454 epq.epi = epi;
1455 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
1464 revents = ep_item_poll(epi, &epq.pt, 1);
1465
1466
1467
1468
1469
1470
1471 error = -ENOMEM;
1472 if (epi->nwait < 0)
1473 goto error_unregister;
1474
1475
1476 spin_lock(&tfile->f_lock);
1477 list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
1478 spin_unlock(&tfile->f_lock);
1479
1480
1481
1482
1483
1484 ep_rbtree_insert(ep, epi);
1485
1486
1487 error = -EINVAL;
1488 if (full_check && reverse_path_check())
1489 goto error_remove_epi;
1490
1491
1492 spin_lock_irq(&ep->wq.lock);
1493
1494
1495 ep_set_busy_poll_napi_id(epi);
1496
1497
1498 if (revents && !ep_is_linked(epi)) {
1499 list_add_tail(&epi->rdllink, &ep->rdllist);
1500 ep_pm_stay_awake(epi);
1501
1502
1503 if (waitqueue_active(&ep->wq))
1504 wake_up_locked(&ep->wq);
1505 if (waitqueue_active(&ep->poll_wait))
1506 pwake++;
1507 }
1508
1509 spin_unlock_irq(&ep->wq.lock);
1510
1511 atomic_long_inc(&ep->user->epoll_watches);
1512
1513
1514 if (pwake)
1515 ep_poll_safewake(&ep->poll_wait);
1516
1517 return 0;
1518
1519error_remove_epi:
1520 spin_lock(&tfile->f_lock);
1521 list_del_rcu(&epi->fllink);
1522 spin_unlock(&tfile->f_lock);
1523
1524 rb_erase_cached(&epi->rbn, &ep->rbr);
1525
1526error_unregister:
1527 ep_unregister_pollwait(ep, epi);
1528
1529
1530
1531
1532
1533
1534
1535 spin_lock_irq(&ep->wq.lock);
1536 if (ep_is_linked(epi))
1537 list_del_init(&epi->rdllink);
1538 spin_unlock_irq(&ep->wq.lock);
1539
1540 wakeup_source_unregister(ep_wakeup_source(epi));
1541
1542error_create_wakeup_source:
1543 kmem_cache_free(epi_cache, epi);
1544
1545 return error;
1546}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
1552static int ep_modify(struct eventpoll *ep, struct epitem *epi,
1553 const struct epoll_event *event)
1554{
1555 int pwake = 0;
1556 poll_table pt;
1557
1558 lockdep_assert_irqs_enabled();
1559
1560 init_poll_funcptr(&pt, NULL);
1561
1562
1563
1564
1565
1566
1567 epi->event.events = event->events;
1568 epi->event.data = event->data;
1569 if (epi->event.events & EPOLLWAKEUP) {
1570 if (!ep_has_wakeup_source(epi))
1571 ep_create_wakeup_source(epi);
1572 } else if (ep_has_wakeup_source(epi)) {
1573 ep_destroy_wakeup_source(epi);
1574 }

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs.  This ensures
	 *    we do not miss events from ep_poll_callback if an
	 *    event occurs immediately after we call f_op->poll().
	 *    We need this because we did not take ep->wq.lock while
	 *    changing epi above (but ep_poll_callback does take
	 *    ep->wq.lock).
	 *
	 * 2) We also need to ensure we do not miss _past_ events
	 *    when calling f_op->poll().  This barrier also
	 *    pairs with the barrier in wq_has_sleeper (see
	 *    comments for wq_has_sleeper).
	 *
	 * This barrier will now guarantee ep_poll_callback or f_op->poll
	 * will notice the readiness of an item.
	 */
1594 smp_mb();
1595
1596
1597
1598
1599
1600
1601
1602 if (ep_item_poll(epi, &pt, 1)) {
1603 spin_lock_irq(&ep->wq.lock);
1604 if (!ep_is_linked(epi)) {
1605 list_add_tail(&epi->rdllink, &ep->rdllist);
1606 ep_pm_stay_awake(epi);
1607
1608
1609 if (waitqueue_active(&ep->wq))
1610 wake_up_locked(&ep->wq);
1611 if (waitqueue_active(&ep->poll_wait))
1612 pwake++;
1613 }
1614 spin_unlock_irq(&ep->wq.lock);
1615 }
1616
1617
1618 if (pwake)
1619 ep_poll_safewake(&ep->poll_wait);
1620
1621 return 0;
1622}
1623
1624static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1625 void *priv)
1626{
1627 struct ep_send_events_data *esed = priv;
1628 __poll_t revents;
1629 struct epitem *epi;
1630 struct epoll_event __user *uevent;
1631 struct wakeup_source *ws;
1632 poll_table pt;
1633
1634 init_poll_funcptr(&pt, NULL);

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
1641 for (esed->res = 0, uevent = esed->events;
1642 !list_empty(head) && esed->res < esed->maxevents;) {
1643 epi = list_first_entry(head, struct epitem, rdllink);
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654 ws = ep_wakeup_source(epi);
1655 if (ws) {
1656 if (ws->active)
1657 __pm_stay_awake(ep->ws);
1658 __pm_relax(ws);
1659 }
1660
1661 list_del_init(&epi->rdllink);
1662
1663 revents = ep_item_poll(epi, &pt, 1);
1664
1665
1666
1667
1668
1669
1670
1671 if (revents) {
1672 if (__put_user(revents, &uevent->events) ||
1673 __put_user(epi->event.data, &uevent->data)) {
1674 list_add(&epi->rdllink, head);
1675 ep_pm_stay_awake(epi);
1676 if (!esed->res)
1677 esed->res = -EFAULT;
1678 return 0;
1679 }
1680 esed->res++;
1681 uevent++;
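			/*
			 * EPOLLONESHOT: disarm the descriptor after delivering
			 * this event by clearing everything but the private
			 * control bits; it stays disabled until the next
			 * EPOLL_CTL_MOD.
			 */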
1682 if (epi->event.events & EPOLLONESHOT)
1683 epi->event.events &= EP_PRIVATE_BITS;
1684 else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert it back
				 * inside the ready list, so that the next call
				 * to epoll_wait() will check again the events
				 * availability. At this point, no one can
				 * insert into ep->rdllist besides us. The
				 * epoll_ctl() callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
1696 list_add_tail(&epi->rdllink, &ep->rdllist);
1697 ep_pm_stay_awake(epi);
1698 }
1699 }
1700 }
1701
1702 return 0;
1703}
1704
1705static int ep_send_events(struct eventpoll *ep,
1706 struct epoll_event __user *events, int maxevents)
1707{
1708 struct ep_send_events_data esed;
1709
1710 esed.maxevents = maxevents;
1711 esed.events = events;
1712
1713 ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
1714 return esed.res;
1715}
1716
1717static inline struct timespec64 ep_set_mstimeout(long ms)
1718{
1719 struct timespec64 now, ts = {
1720 .tv_sec = ms / MSEC_PER_SEC,
1721 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1722 };
1723
1724 ktime_get_ts64(&now);
1725 return timespec64_add_safe(now, ts);
1726}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
1745static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1746 int maxevents, long timeout)
1747{
1748 int res = 0, eavail, timed_out = 0;
1749 u64 slack = 0;
1750 wait_queue_entry_t wait;
1751 ktime_t expires, *to = NULL;
1752
1753 lockdep_assert_irqs_enabled();
1754
1755 if (timeout > 0) {
1756 struct timespec64 end_time = ep_set_mstimeout(timeout);
1757
1758 slack = select_estimate_accuracy(&end_time);
1759 to = &expires;
1760 *to = timespec64_to_ktime(end_time);
1761 } else if (timeout == 0) {
1762
1763
1764
1765
1766 timed_out = 1;
1767 spin_lock_irq(&ep->wq.lock);
1768 goto check_events;
1769 }
1770
1771fetch_events:
1772
1773 if (!ep_events_available(ep))
1774 ep_busy_loop(ep, timed_out);
1775
1776 spin_lock_irq(&ep->wq.lock);
1777
1778 if (!ep_events_available(ep)) {
1779
1780
1781
1782
1783
1784 ep_reset_busy_poll_napi_id(ep);
1785
1786
1787
1788
1789
1790
1791 init_waitqueue_entry(&wait, current);
1792 __add_wait_queue_exclusive(&ep->wq, &wait);
1793
1794 for (;;) {
1795
1796
1797
1798
1799
1800 set_current_state(TASK_INTERRUPTIBLE);
1801
1802
1803
1804
1805
1806
1807 if (fatal_signal_pending(current)) {
1808 res = -EINTR;
1809 break;
1810 }
1811 if (ep_events_available(ep) || timed_out)
1812 break;
1813 if (signal_pending(current)) {
1814 res = -EINTR;
1815 break;
1816 }
1817
1818 spin_unlock_irq(&ep->wq.lock);
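			/* Sleep until a wakeup from ep_poll_callback(), a signal, or the timeout */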
1819 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1820 timed_out = 1;
1821
1822 spin_lock_irq(&ep->wq.lock);
1823 }
1824
1825 __remove_wait_queue(&ep->wq, &wait);
1826 __set_current_state(TASK_RUNNING);
1827 }
1828check_events:
1829
1830 eavail = ep_events_available(ep);
1831
1832 spin_unlock_irq(&ep->wq.lock);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
1839 if (!res && eavail &&
1840 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1841 goto fetch_events;
1842
1843 return res;
1844}

/**
 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
 *                      API, to verify that adding an epoll file inside another
 *                      epoll structure does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @priv: Pointer to the epoll file to be currently checked.
 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
 *          file.
 * @call_nests: Current depth of the @ep_call_nested() call stack.
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
1861static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1862{
1863 int error = 0;
1864 struct file *file = priv;
1865 struct eventpoll *ep = file->private_data;
1866 struct eventpoll *ep_tovisit;
1867 struct rb_node *rbp;
1868 struct epitem *epi;
1869
1870 mutex_lock_nested(&ep->mtx, call_nests + 1);
1871 ep->visited = 1;
1872 list_add(&ep->visited_list_link, &visited_list);
1873 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1874 epi = rb_entry(rbp, struct epitem, rbn);
1875 if (unlikely(is_file_epoll(epi->ffd.file))) {
1876 ep_tovisit = epi->ffd.file->private_data;
1877 if (ep_tovisit->visited)
1878 continue;
1879 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1880 ep_loop_check_proc, epi->ffd.file,
1881 ep_tovisit, current);
1882 if (error != 0)
1883 break;
1884 } else {
1885
1886
1887
1888
1889
1890
1891
1892
1893 if (list_empty(&epi->ffd.file->f_tfile_llink))
1894 list_add(&epi->ffd.file->f_tfile_llink,
1895 &tfile_check_list);
1896 }
1897 }
1898 mutex_unlock(&ep->mtx);
1899
1900 return error;
1901}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
 *                 inside another epoll file (represented by @ep) does not
 *                 create closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll private data structure.
 * @file: Pointer to the epoll file to be checked.
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
1914static int ep_loop_check(struct eventpoll *ep, struct file *file)
1915{
1916 int ret;
1917 struct eventpoll *ep_cur, *ep_next;
1918
1919 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1920 ep_loop_check_proc, file, ep, current);
1921
1922 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1923 visited_list_link) {
1924 ep_cur->visited = 0;
1925 list_del(&ep_cur->visited_list_link);
1926 }
1927 return ret;
1928}
1929
1930static void clear_tfile_check_list(void)
1931{
1932 struct file *file;
1933
1934
1935 while (!list_empty(&tfile_check_list)) {
1936 file = list_first_entry(&tfile_check_list, struct file,
1937 f_tfile_llink);
1938 list_del_init(&file->f_tfile_llink);
1939 }
1940 INIT_LIST_HEAD(&tfile_check_list);
1941}

/*
 * Open an eventpoll file descriptor.
 */
1946static int do_epoll_create(int flags)
1947{
1948 int error, fd;
1949 struct eventpoll *ep = NULL;
1950 struct file *file;
1951
1952
1953 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1954
1955 if (flags & ~EPOLL_CLOEXEC)
1956 return -EINVAL;
1957
1958
1959
1960 error = ep_alloc(&ep);
1961 if (error < 0)
1962 return error;
1963
1964
1965
1966
1967 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1968 if (fd < 0) {
1969 error = fd;
1970 goto out_free_ep;
1971 }
1972 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1973 O_RDWR | (flags & O_CLOEXEC));
1974 if (IS_ERR(file)) {
1975 error = PTR_ERR(file);
1976 goto out_free_fd;
1977 }
1978 ep->file = file;
1979 fd_install(fd, file);
1980 return fd;
1981
1982out_free_fd:
1983 put_unused_fd(fd);
1984out_free_ep:
1985 ep_free(ep);
1986 return error;
1987}
1988
1989SYSCALL_DEFINE1(epoll_create1, int, flags)
1990{
1991 return do_epoll_create(flags);
1992}
1993
1994SYSCALL_DEFINE1(epoll_create, int, size)
1995{
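	/*
	 * The size argument is ignored; it only has to be greater than zero
	 * and exists for backward compatibility with the original epoll_create().
	 */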
1996 if (size <= 0)
1997 return -EINVAL;
1998
1999 return do_epoll_create(0);
2000}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
2007SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
2008 struct epoll_event __user *, event)
2009{
2010 int error;
2011 int full_check = 0;
2012 struct fd f, tf;
2013 struct eventpoll *ep;
2014 struct epitem *epi;
2015 struct epoll_event epds;
2016 struct eventpoll *tep = NULL;
2017
2018 error = -EFAULT;
2019 if (ep_op_has_event(op) &&
2020 copy_from_user(&epds, event, sizeof(struct epoll_event)))
2021 goto error_return;
2022
2023 error = -EBADF;
2024 f = fdget(epfd);
2025 if (!f.file)
2026 goto error_return;
2027
2028
2029 tf = fdget(fd);
2030 if (!tf.file)
2031 goto error_fput;
2032
2033
2034 error = -EPERM;
2035 if (!file_can_poll(tf.file))
2036 goto error_tgt_fput;
2037
2038
2039 if (ep_op_has_event(op))
2040 ep_take_care_of_epollwakeup(&epds);
2041
2042
2043
2044
2045
2046
2047 error = -EINVAL;
2048 if (f.file == tf.file || !is_file_epoll(f.file))
2049 goto error_tgt_fput;
2050
2051
2052
2053
2054
2055
2056 if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
2057 if (op == EPOLL_CTL_MOD)
2058 goto error_tgt_fput;
2059 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
2060 (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
2061 goto error_tgt_fput;
2062 }
2063
2064
2065
2066
2067
2068 ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
2085 mutex_lock_nested(&ep->mtx, 0);
2086 if (op == EPOLL_CTL_ADD) {
2087 if (!list_empty(&f.file->f_ep_links) ||
2088 is_file_epoll(tf.file)) {
2089 full_check = 1;
2090 mutex_unlock(&ep->mtx);
2091 mutex_lock(&epmutex);
2092 if (is_file_epoll(tf.file)) {
2093 error = -ELOOP;
2094 if (ep_loop_check(ep, tf.file) != 0) {
2095 clear_tfile_check_list();
2096 goto error_tgt_fput;
2097 }
2098 } else
2099 list_add(&tf.file->f_tfile_llink,
2100 &tfile_check_list);
2101 mutex_lock_nested(&ep->mtx, 0);
2102 if (is_file_epoll(tf.file)) {
2103 tep = tf.file->private_data;
2104 mutex_lock_nested(&tep->mtx, 1);
2105 }
2106 }
2107 }
2108
2109
2110
2111
2112
2113
2114 epi = ep_find(ep, tf.file, fd);
2115
2116 error = -EINVAL;
2117 switch (op) {
2118 case EPOLL_CTL_ADD:
2119 if (!epi) {
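			/* EPOLLERR and EPOLLHUP are always reported, even when not requested */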
2120 epds.events |= EPOLLERR | EPOLLHUP;
2121 error = ep_insert(ep, &epds, tf.file, fd, full_check);
2122 } else
2123 error = -EEXIST;
2124 if (full_check)
2125 clear_tfile_check_list();
2126 break;
2127 case EPOLL_CTL_DEL:
2128 if (epi)
2129 error = ep_remove(ep, epi);
2130 else
2131 error = -ENOENT;
2132 break;
2133 case EPOLL_CTL_MOD:
2134 if (epi) {
2135 if (!(epi->event.events & EPOLLEXCLUSIVE)) {
2136 epds.events |= EPOLLERR | EPOLLHUP;
2137 error = ep_modify(ep, epi, &epds);
2138 }
2139 } else
2140 error = -ENOENT;
2141 break;
2142 }
2143 if (tep != NULL)
2144 mutex_unlock(&tep->mtx);
2145 mutex_unlock(&ep->mtx);
2146
2147error_tgt_fput:
2148 if (full_check)
2149 mutex_unlock(&epmutex);
2150
2151 fdput(tf);
2152error_fput:
2153 fdput(f);
2154error_return:
2155
2156 return error;
2157}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
2163static int do_epoll_wait(int epfd, struct epoll_event __user *events,
2164 int maxevents, int timeout)
2165{
2166 int error;
2167 struct fd f;
2168 struct eventpoll *ep;
2169
2170
2171 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
2172 return -EINVAL;
2173
2174
2175 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
2176 return -EFAULT;
2177
2178
2179 f = fdget(epfd);
2180 if (!f.file)
2181 return -EBADF;
2182
2183
2184
2185
2186
2187 error = -EINVAL;
2188 if (!is_file_epoll(f.file))
2189 goto error_fput;
2190
2191
2192
2193
2194
2195 ep = f.file->private_data;
2196
2197
2198 error = ep_poll(ep, events, maxevents, timeout);
2199
2200error_fput:
2201 fdput(f);
2202 return error;
2203}
2204
2205SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
2206 int, maxevents, int, timeout)
2207{
2208 return do_epoll_wait(epfd, events, maxevents, timeout);
2209}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
2215SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
2216 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
2217 size_t, sigsetsize)
2218{
2219 int error;
2220 sigset_t ksigmask, sigsaved;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
2226 if (sigmask) {
2227 if (sigsetsize != sizeof(sigset_t))
2228 return -EINVAL;
2229 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
2230 return -EFAULT;
2231 sigsaved = current->blocked;
2232 set_current_blocked(&ksigmask);
2233 }
2234
2235 error = do_epoll_wait(epfd, events, maxevents, timeout);

	/*
	 * If we changed the signal mask, we need to restore the original one.
	 * In case we've got a signal while waiting, we do not restore the
	 * signal mask yet, and we allow do_signal() to deliver the signal on
	 * the way back to userspace, before the signal mask is restored.
	 */
2243 if (sigmask) {
2244 if (error == -EINTR) {
			memcpy(&current->saved_sigmask, &sigsaved,
			       sizeof(sigsaved));
2247 set_restore_sigmask();
2248 } else
2249 set_current_blocked(&sigsaved);
2250 }
2251
2252 return error;
2253}
2254
2255#ifdef CONFIG_COMPAT
2256COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2257 struct epoll_event __user *, events,
2258 int, maxevents, int, timeout,
2259 const compat_sigset_t __user *, sigmask,
2260 compat_size_t, sigsetsize)
2261{
2262 long err;
2263 sigset_t ksigmask, sigsaved;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
2269 if (sigmask) {
2270 if (sigsetsize != sizeof(compat_sigset_t))
2271 return -EINVAL;
2272 if (get_compat_sigset(&ksigmask, sigmask))
2273 return -EFAULT;
2274 sigsaved = current->blocked;
2275 set_current_blocked(&ksigmask);
2276 }
2277
2278 err = do_epoll_wait(epfd, events, maxevents, timeout);

	/*
	 * If we changed the signal mask, we need to restore the original one.
	 * In case we've got a signal while waiting, we do not restore the
	 * signal mask yet, and we allow do_signal() to deliver the signal on
	 * the way back to userspace, before the signal mask is restored.
	 */
2286 if (sigmask) {
2287 if (err == -EINTR) {
			memcpy(&current->saved_sigmask, &sigsaved,
			       sizeof(sigsaved));
2290 set_restore_sigmask();
2291 } else
2292 set_current_blocked(&sigsaved);
2293 }
2294
2295 return err;
2296}
2297#endif
2298
2299static int __init eventpoll_init(void)
2300{
2301 struct sysinfo si;
2302
2303 si_meminfo(&si);

	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
2307 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2308 EP_ITEM_COST;
2309 BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loop checks.
	 */
2315 ep_nested_calls_init(&poll_loop_ncalls);
2316
2317#ifdef CONFIG_DEBUG_LOCK_ALLOC
2318
2319 ep_nested_calls_init(&poll_safewake_ncalls);
2320#endif

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64 bit (and smaller) CPUs.
	 */
2326 BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
2327
2328
2329 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
2330 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
2331
2332
2333 pwq_cache = kmem_cache_create("eventpoll_pwq",
2334 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
2335
2336 return 0;
2337}
2338fs_initcall(eventpoll_init);
2339