1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched/signal.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/poll.h>
24#include <linux/string.h>
25#include <linux/list.h>
26#include <linux/hash.h>
27#include <linux/spinlock.h>
28#include <linux/syscalls.h>
29#include <linux/rbtree.h>
30#include <linux/wait.h>
31#include <linux/eventpoll.h>
32#include <linux/mount.h>
33#include <linux/bitops.h>
34#include <linux/mutex.h>
35#include <linux/anon_inodes.h>
36#include <linux/device.h>
37#include <linux/uaccess.h>
38#include <asm/io.h>
39#include <asm/mman.h>
40#include <linux/atomic.h>
41#include <linux/proc_fs.h>
42#include <linux/seq_file.h>
43#include <linux/compat.h>
44#include <linux/rculist.h>
45#include <net/busy_poll.h>
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/*
 * Bits in epoll_event.events that configure epoll's own behaviour.
 * ep_poll_callback() ignores an item whose mask has nothing set outside
 * these bits, and ep_send_events_proc() masks an EPOLLONESHOT item down
 * to these bits after delivering it.
 */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

/* Read/write poll bits; ep_poll_callback() switches on these in the wake key. */
#define EPOLLINOUT_BITS (POLLIN | POLLOUT)

/*
 * NOTE(review): presumably the set of event bits accepted together with
 * EPOLLEXCLUSIVE; the user of this mask is outside this part of the file.
 */
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)


/* Passed as the max_nests argument of every ep_call_nested() call in this file. */
#define EP_MAX_NESTS 4

/* Number of struct epoll_event entries that fit in INT_MAX bytes. */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/*
 * Sentinel distinct from NULL: stored in ep->ovflist and epi->next when
 * the overflow list is not in use / the item is not chained on it.
 */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Combined size of one epitem and one eppoll_entry. */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
111
/*
 * Identity of a watched target: the struct file pointer together with the
 * fd number. ep_cmp_ffd() orders items by this pair.
 */
struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;
116
117
118
119
120
/*
 * One in-progress call tracked by ep_call_nested(). The node lives on the
 * caller's stack and is linked on nested_calls.tasks_call_list for the
 * duration of the callback.
 */
struct nested_call_node {
	struct list_head llink;	/* link on nested_calls.tasks_call_list */
	void *cookie;		/* object the call is operating on */
	void *ctx;		/* calling context the call belongs to */
};
126
127
128
129
130
/*
 * Bookkeeping for ep_call_nested(): the list of calls currently in
 * progress and the spinlock that protects it.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};
135
136
137
138
139
140
141
/*
 * One watched (file, fd) pair registered with an eventpoll instance.
 * Allocated from epi_cache in ep_insert() and freed via RCU in ep_remove().
 */
struct epitem {
	union {
		/* Node in the owning eventpoll's rbtree (ep->rbr). */
		struct rb_node rbn;
		/* Used by call_rcu() in ep_remove() once the item is unlinked. */
		struct rcu_head rcu;
	};

	/* Link on the owning eventpoll's ready list (ep->rdllist). */
	struct list_head rdllink;

	/*
	 * Chain pointer for ep->ovflist; EP_UNACTIVE_PTR while the item is
	 * not on that list.
	 */
	struct epitem *next;

	/* The file and fd number this item refers to. */
	struct epoll_filefd ffd;

	/*
	 * Number of wait-queue entries attached by ep_ptable_queue_proc();
	 * set to -1 there when attaching fails.
	 */
	int nwait;

	/* List of struct eppoll_entry attached for this item. */
	struct list_head pwqlist;

	/* The eventpoll instance that owns this item. */
	struct eventpoll *ep;

	/* Link on the target file's f_ep_links list. */
	struct list_head fllink;

	/* Wakeup source, non-NULL only when EPOLLWAKEUP was requested. */
	struct wakeup_source __rcu *ws;

	/* Event mask and user data supplied for this item. */
	struct epoll_event event;
};
180
181
182
183
184
185
/*
 * State of one epoll instance; stored in the epoll file's private_data
 * (see ep_eventpoll_release() / ep_eventpoll_poll()).
 */
struct eventpoll {
	/* Spinlock guarding rdllist and ovflist; taken with IRQs disabled. */
	spinlock_t lock;

	/*
	 * Mutex held while scanning the ready list and while items are
	 * removed (see ep_scan_ready_list(), ep_free(),
	 * eventpoll_release_file()).
	 */
	struct mutex mtx;

	/* Wait queue that ep_poll() sleeps on. */
	wait_queue_head_t wq;

	/* Wait queue handed to poll_wait() by ep_eventpoll_poll(). */
	wait_queue_head_t poll_wait;

	/* List of items that have events ready. */
	struct list_head rdllist;

	/* Red-black tree holding every epitem of this instance. */
	struct rb_root_cached rbr;

	/*
	 * Singly linked list (via epitem.next) collecting items that become
	 * ready while ep_scan_ready_list() is running its callback;
	 * EP_UNACTIVE_PTR at all other times.
	 */
	struct epitem *ovflist;

	/* Wakeup source registered as "eventpoll" by ep_create_wakeup_source(). */
	struct wakeup_source *ws;

	/* User reference taken in ep_alloc(); its epoll_watches counter is charged. */
	struct user_struct *user;

	/* NOTE(review): assigned outside this part of the file; read by reverse_path_check_proc(). */
	struct file *file;

	/* Marker and list link used by ep_loop_check_proc() while walking nested epolls. */
	int visited;
	struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* NAPI id recorded by ep_set_busy_poll_napi_id() and used by ep_busy_loop(). */
	unsigned int napi_id;
#endif
};
234
235
/*
 * One hook into a target file's wait queue, created by
 * ep_ptable_queue_proc() and allocated from pwq_cache.
 */
struct eppoll_entry {
	/* Link on the owning item's pwqlist. */
	struct list_head llink;

	/* The epitem this entry belongs to. */
	struct epitem *base;

	/*
	 * Wait-queue entry queued on *whead; its callback is
	 * ep_poll_callback().
	 */
	wait_queue_entry_t wait;

	/* Wait-queue head we are queued on; cleared by ep_poll_callback() on POLLFREE. */
	wait_queue_head_t *whead;
};
252
253
/*
 * Wraps a poll_table so that ep_ptable_queue_proc() can recover the
 * epitem being registered (see ep_item_from_epqueue()).
 */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
258
259
/* Arguments handed from ep_send_events() to ep_send_events_proc(). */
struct ep_send_events_data {
	int maxevents;				/* capacity of the user buffer, in events */
	struct epoll_event __user *events;	/* user buffer to fill */
};
264
265
266
267
268
/* Per-user limit on watches; ep_insert() returns -ENOSPC once it is reached. */
static long max_user_watches __read_mostly;

/* Global mutex taken by ep_free() and eventpoll_release_file(). */
static DEFINE_MUTEX(epmutex);

/* Nesting tracker used by the loop / reverse-path checks. */
static struct nested_calls poll_loop_ncalls;

/* Nesting tracker used by ep_poll_safewake(). */
static struct nested_calls poll_safewake_ncalls;

/* Nesting tracker used by ep_eventpoll_poll(). */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache for struct epitem. */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache for struct eppoll_entry. */
static struct kmem_cache *pwq_cache __read_mostly;

/* eventpoll instances marked visited by ep_loop_check_proc(). */
static LIST_HEAD(visited_list);

/*
 * Files (linked through f_tfile_llink) collected by ep_loop_check_proc()
 * and examined by reverse_path_check().
 */
static LIST_HEAD(tfile_check_list);
299
300#ifdef CONFIG_SYSCTL
301
302#include <linux/sysctl.h>
303
/* Lower and upper bounds handed to the sysctl handler below. */
static long zero;
static long long_max = LONG_MAX;

/*
 * sysctl table exposing max_user_watches as a read/write (0644) entry
 * named "max_user_watches", clamped to [0, LONG_MAX].
 */
struct ctl_table epoll_table[] = {
	{
		.procname = "max_user_watches",
		.data = &max_user_watches,
		.maxlen = sizeof(max_user_watches),
		.mode = 0644,
		.proc_handler = proc_doulongvec_minmax,
		.extra1 = &zero,
		.extra2 = &long_max,
	},
	{ }	/* empty terminating entry */
};
319#endif
320
321static const struct file_operations eventpoll_fops;
322
323static inline int is_file_epoll(struct file *f)
324{
325 return f->f_op == &eventpoll_fops;
326}
327
328
329static inline void ep_set_ffd(struct epoll_filefd *ffd,
330 struct file *file, int fd)
331{
332 ffd->file = file;
333 ffd->fd = fd;
334}
335
336
337static inline int ep_cmp_ffd(struct epoll_filefd *p1,
338 struct epoll_filefd *p2)
339{
340 return (p1->file > p2->file ? +1:
341 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
342}
343
344
/* Returns 1 when list node @p is currently on a list, 0 when it is empty. */
static inline int ep_is_linked(struct list_head *p)
{
	return list_empty(p) ? 0 : 1;
}
349
350static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
351{
352 return container_of(p, struct eppoll_entry, wait);
353}
354
355
356static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
357{
358 return container_of(p, struct eppoll_entry, wait)->base;
359}
360
361
362static inline struct epitem *ep_item_from_epqueue(poll_table *p)
363{
364 return container_of(p, struct ep_pqueue, pt)->epi;
365}
366
367
/* Returns 1 for every operation except EPOLL_CTL_DEL, for which it returns 0. */
static inline int ep_op_has_event(int op)
{
	return (op == EPOLL_CTL_DEL) ? 0 : 1;
}
372
373
374static void ep_nested_calls_init(struct nested_calls *ncalls)
375{
376 INIT_LIST_HEAD(&ncalls->tasks_call_list);
377 spin_lock_init(&ncalls->lock);
378}
379
380
381
382
383
384
385
386
387
388static inline int ep_events_available(struct eventpoll *ep)
389{
390 return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
391}
392
393#ifdef CONFIG_NET_RX_BUSY_POLL
394static bool ep_busy_loop_end(void *p, unsigned long start_time)
395{
396 struct eventpoll *ep = p;
397
398 return ep_events_available(ep) || busy_loop_timeout(start_time);
399}
400#endif
401
402
403
404
405
406
407
/*
 * Busy-poll the NAPI context recorded in @ep, if there is a valid one and
 * busy polling is enabled. With @nonblock set no end-condition callback is
 * supplied; otherwise ep_busy_loop_end() decides when to stop.
 * Compiles to nothing without CONFIG_NET_RX_BUSY_POLL.
 */
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	unsigned int napi_id = READ_ONCE(ep->napi_id);

	if (napi_id < MIN_NAPI_ID || !net_busy_loop_on())
		return;

	if (nonblock)
		napi_busy_loop(napi_id, NULL, ep);
	else
		napi_busy_loop(napi_id, ep_busy_loop_end, ep);
#endif
}
417
/*
 * Forget the recorded NAPI id. The field is only written when it is
 * currently non-zero. No-op without CONFIG_NET_RX_BUSY_POLL.
 */
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	if (ep->napi_id != 0) {
		ep->napi_id = 0;
	}
#endif
}
425
426
427
428
/*
 * Record, in the owning eventpoll, the NAPI id of the socket behind @epi.
 * Nothing happens when busy polling is off, the file is not a socket, the
 * socket has no sock, the id is below MIN_NAPI_ID, or the id is already
 * the recorded one. No-op without CONFIG_NET_RX_BUSY_POLL.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	struct eventpoll *owner;
	struct socket *sock;
	struct sock *sk;
	unsigned int id;
	int err;

	if (!net_busy_loop_on())
		return;

	sock = sock_from_file(epi->ffd.file, &err);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	id = READ_ONCE(sk->sk_napi_id);
	owner = epi->ep;

	/* Only store ids that are valid and differ from the current one. */
	if (id >= MIN_NAPI_ID && id != owner->napi_id)
		owner->napi_id = id;
#endif
}
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
/*
 * ep_call_nested - run @nproc while tracking recursion per (@ctx, @cookie).
 *
 * @ncalls:    tracker holding the list of calls in progress and its lock.
 * @max_nests: maximum number of calls already in progress for @ctx that
 *             is tolerated.
 * @nproc:     callback, invoked as nproc(@priv, @cookie, call_nests).
 * @priv:      opaque first argument for @nproc.
 * @cookie:    identifies the object being processed.
 * @ctx:       identifies the calling context (callers in this file pass
 *             either `current` or a CPU number).
 *
 * Returns the value returned by @nproc, or -1 when the call is refused
 * because the same (@ctx, @cookie) pair is already in progress or more
 * than @max_nests calls for @ctx are in progress.
 */
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Walk the calls in progress. For entries of our own context, a
	 * matching cookie means a loop; otherwise count the entry and fail
	 * once the count exceeds max_nests.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
			/* Loop detected or nesting limit exceeded. */
			error = -1;
			goto out_unlock;
		}
	}

	/* Publish this call (node lives on our stack) before running it. */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* The callback runs without the tracker lock held. */
	error = (*nproc)(priv, cookie, call_nests);

	/* Unlink our node again; it must not outlive this stack frame. */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
/*
 * ep_wake_up_nested - wake the waiters on @wqueue with poll key @events.
 *
 * With CONFIG_DEBUG_LOCK_ALLOC the wait-queue lock is taken through the
 * _nested variant using @subclass and the wakeup is done with the lock
 * already held; without it @subclass is unused and the plain
 * wake_up_poll() is called.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long flags;

	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, flags);
}
#else
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	wake_up_poll(wqueue, events);
}
#endif
570
571static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
572{
573 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
574 1 + call_nests);
575 return 0;
576}
577
578
579
580
581
582
583
584
585
586
587
588static void ep_poll_safewake(wait_queue_head_t *wq)
589{
590 int this_cpu = get_cpu();
591
592 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
593 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
594
595 put_cpu();
596}
597
/*
 * Detach @pwq's wait entry from the wait queue it was registered on,
 * unless that queue pointer has already been cleared.
 */
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/*
	 * Pairs with the smp_store_release(..., NULL) done in
	 * ep_poll_callback() on POLLFREE: a NULL whead means the entry was
	 * already unlinked there and the queue must not be touched.
	 * NOTE(review): the RCU read section presumably keeps *whead alive
	 * while we use it — confirm against the wait-queue owner.
	 */
	whead = smp_load_acquire(&pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}
614
615
616
617
618
619
620static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
621{
622 struct list_head *lsthead = &epi->pwqlist;
623 struct eppoll_entry *pwq;
624
625 while (!list_empty(lsthead)) {
626 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
627
628 list_del(&pwq->llink);
629 ep_remove_wait_queue(pwq);
630 kmem_cache_free(pwq_cache, pwq);
631 }
632}
633
634
635static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
636{
637 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
638}
639
640
/* Activate @epi's wakeup source, if it has one (mtx-protected variant). */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (!ws)
		return;

	__pm_stay_awake(ws);
}
648
649static inline bool ep_has_wakeup_source(struct epitem *epi)
650{
651 return rcu_access_pointer(epi->ws) ? true : false;
652}
653
654
655static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
656{
657 struct wakeup_source *ws;
658
659 rcu_read_lock();
660 ws = rcu_dereference(epi->ws);
661 if (ws)
662 __pm_stay_awake(ws);
663 rcu_read_unlock();
664}
665
666
667
668
669
670
671
672
673
674
675
676
677
678
/*
 * ep_scan_ready_list - run @sproc over a private snapshot of the ready list.
 *
 * @ep:        eventpoll instance to scan.
 * @sproc:     callback invoked as sproc(@ep, &txlist, @priv) on the
 *             snapshot; it runs without ep->lock held.
 * @priv:      opaque argument for @sproc.
 * @depth:     lockdep subclass used when taking ep->mtx.
 * @ep_locked: true when the caller already holds ep->mtx.
 *
 * Returns the value returned by @sproc.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/* Take ep->mtx unless the caller told us it is already held. */
	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the whole ready list into txlist and switch ovflist from
	 * EP_UNACTIVE_PTR to NULL: from now on ep_poll_callback() chains
	 * newly ready items on ovflist instead of touching rdllist.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/* Let the callback process the snapshot with the spinlock dropped. */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * Drain the items that became ready while the callback ran. Each
	 * item's next pointer is reset to EP_UNACTIVE_PTR as it is taken
	 * off the chain.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * Only queue the item if it is not on a list already (it
		 * may still sit on txlist if the callback left it there).
		 */
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}

	/* Back to normal mode: callbacks use rdllist directly again. */
	ep->ovflist = EP_UNACTIVE_PTR;

	/* Put back whatever the callback left on txlist. */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Events remain: wake ep_poll() sleepers now (we hold
		 * ep->lock) and remember to wake poll_wait after unlocking.
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	/* The poll_wait wakeup is done with no locks held. */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
768
769static void epi_rcu_free(struct rcu_head *head)
770{
771 struct epitem *epi = container_of(head, struct epitem, rcu);
772 kmem_cache_free(epi_cache, epi);
773}
774
775
776
777
778
/*
 * ep_remove - unlink @epi from @ep and schedule it for freeing.
 *
 * Callers in this file hold ep->mtx. Always returns 0.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/* Detach from the target file's wait queues so no new callbacks arrive. */
	ep_unregister_pollwait(ep, epi);

	/* Drop the item from the target file's f_ep_links list. */
	spin_lock(&file->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	/* Take it off the ready list if it is queued there. */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	wakeup_source_unregister(ep_wakeup_source(epi));

	/*
	 * The item was removed with list_del_rcu() above, so the memory is
	 * released through call_rcu() rather than freed immediately.
	 * rbn and rcu share storage; rbn is no longer needed at this point.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	/* Un-charge the watch from the owning user. */
	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}
820
/*
 * ep_free - tear down an eventpoll instance and release all its items.
 * Called from ep_eventpoll_release().
 */
static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* Wake anyone still polling this epoll file. */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * epmutex is also taken by eventpoll_release_file(), so holding it
	 * keeps that path from removing items underneath us.
	 */
	mutex_lock(&epmutex);

	/* First pass: detach every item from its target's wait queues. */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Second pass: remove the items one by one under ep->mtx.
	 * ep_remove() erases the node from the tree, so always restart
	 * from the first node.
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);	/* drop the reference taken in ep_alloc() */
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}
872
873static int ep_eventpoll_release(struct inode *inode, struct file *file)
874{
875 struct eventpoll *ep = file->private_data;
876
877 if (ep)
878 ep_free(ep);
879
880 return 0;
881}
882
883static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
884{
885 pt->_key = epi->event.events;
886
887 return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
888}
889
890static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
891 void *priv)
892{
893 struct epitem *epi, *tmp;
894 poll_table pt;
895
896 init_poll_funcptr(&pt, NULL);
897
898 list_for_each_entry_safe(epi, tmp, head, rdllink) {
899 if (ep_item_poll(epi, &pt))
900 return POLLIN | POLLRDNORM;
901 else {
902
903
904
905
906
907 __pm_relax(ep_wakeup_source(epi));
908 list_del_init(&epi->rdllink);
909 }
910 }
911
912 return 0;
913}
914
/* Defined below; declared here because ep_eventpoll_poll() compares against it. */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt);

/* Arguments passed from ep_eventpoll_poll() to ep_poll_readyevents_proc(). */
struct readyevents_arg {
	struct eventpoll *ep;	/* instance to scan */
	bool locked;		/* forwarded as ep_locked to ep_scan_ready_list() */
};
922
923static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
924{
925 struct readyevents_arg *arg = priv;
926
927 return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
928 call_nests + 1, arg->locked);
929}
930
931static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
932{
933 int pollflags;
934 struct eventpoll *ep = file->private_data;
935 struct readyevents_arg arg;
936
937
938
939
940
941 arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
942 arg.ep = ep;
943
944
945 poll_wait(file, &ep->poll_wait, wait);
946
947
948
949
950
951
952
953 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
954 ep_poll_readyevents_proc, &arg, ep, current);
955
956 return pollflags != -1 ? pollflags : 0;
957}
958
959#ifdef CONFIG_PROC_FS
960static void ep_show_fdinfo(struct seq_file *m, struct file *f)
961{
962 struct eventpoll *ep = f->private_data;
963 struct rb_node *rbp;
964
965 mutex_lock(&ep->mtx);
966 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
967 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
968 struct inode *inode = file_inode(epi->ffd.file);
969
970 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
971 " pos:%lli ino:%lx sdev:%x\n",
972 epi->ffd.fd, epi->event.events,
973 (long long)epi->event.data,
974 (long long)epi->ffd.file->f_pos,
975 inode->i_ino, inode->i_sb->s_dev);
976 if (seq_has_overflowed(m))
977 break;
978 }
979 mutex_unlock(&ep->mtx);
980}
981#endif
982
983
/*
 * File operations of the epoll file. Its address is also used by
 * is_file_epoll() to recognise epoll files.
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = ep_show_fdinfo,
#endif
	.release = ep_eventpoll_release,
	.poll = ep_eventpoll_poll,
	.llseek = noop_llseek,
};
992
993
994
995
996
997
998void eventpoll_release_file(struct file *file)
999{
1000 struct eventpoll *ep;
1001 struct epitem *epi, *next;
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016 mutex_lock(&epmutex);
1017 list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
1018 ep = epi->ep;
1019 mutex_lock_nested(&ep->mtx, 0);
1020 ep_remove(ep, epi);
1021 mutex_unlock(&ep->mtx);
1022 }
1023 mutex_unlock(&epmutex);
1024}
1025
1026static int ep_alloc(struct eventpoll **pep)
1027{
1028 int error;
1029 struct user_struct *user;
1030 struct eventpoll *ep;
1031
1032 user = get_current_user();
1033 error = -ENOMEM;
1034 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
1035 if (unlikely(!ep))
1036 goto free_uid;
1037
1038 spin_lock_init(&ep->lock);
1039 mutex_init(&ep->mtx);
1040 init_waitqueue_head(&ep->wq);
1041 init_waitqueue_head(&ep->poll_wait);
1042 INIT_LIST_HEAD(&ep->rdllist);
1043 ep->rbr = RB_ROOT_CACHED;
1044 ep->ovflist = EP_UNACTIVE_PTR;
1045 ep->user = user;
1046
1047 *pep = ep;
1048
1049 return 0;
1050
1051free_uid:
1052 free_uid(user);
1053 return error;
1054}
1055
1056
1057
1058
1059
1060
1061static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
1062{
1063 int kcmp;
1064 struct rb_node *rbp;
1065 struct epitem *epi, *epir = NULL;
1066 struct epoll_filefd ffd;
1067
1068 ep_set_ffd(&ffd, file, fd);
1069 for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1070 epi = rb_entry(rbp, struct epitem, rbn);
1071 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
1072 if (kcmp > 0)
1073 rbp = rbp->rb_right;
1074 else if (kcmp < 0)
1075 rbp = rbp->rb_left;
1076 else {
1077 epir = epi;
1078 break;
1079 }
1080 }
1081
1082 return epir;
1083}
1084
1085#ifdef CONFIG_CHECKPOINT_RESTORE
1086static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
1087{
1088 struct rb_node *rbp;
1089 struct epitem *epi;
1090
1091 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1092 epi = rb_entry(rbp, struct epitem, rbn);
1093 if (epi->ffd.fd == tfd) {
1094 if (toff == 0)
1095 return epi;
1096 else
1097 toff--;
1098 }
1099 cond_resched();
1100 }
1101
1102 return NULL;
1103}
1104
1105struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
1106 unsigned long toff)
1107{
1108 struct file *file_raw;
1109 struct eventpoll *ep;
1110 struct epitem *epi;
1111
1112 if (!is_file_epoll(file))
1113 return ERR_PTR(-EINVAL);
1114
1115 ep = file->private_data;
1116
1117 mutex_lock(&ep->mtx);
1118 epi = ep_find_tfd(ep, tfd, toff);
1119 if (epi)
1120 file_raw = epi->ffd.file;
1121 else
1122 file_raw = ERR_PTR(-ENOENT);
1123 mutex_unlock(&ep->mtx);
1124
1125 return file_raw;
1126}
1127#endif
1128
1129
1130
1131
1132
1133
/*
 * ep_poll_callback - wait-queue callback installed on every watched file.
 *
 * Invoked by the wakeup of a target file's wait queue. Queues the item on
 * the ready list (or on the overflow list while a scan is in progress) and
 * wakes up sleepers on the eventpoll. @key carries the poll bits of the
 * wakeup, possibly including POLLFREE. @mode and @sync are unused.
 *
 * Returns ewake. NOTE(review): presumably a non-zero return is what lets
 * the wait-queue core count this as a completed exclusive wakeup —
 * confirm against the wait-queue wakeup code.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	int ewake = 0;

	spin_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/*
	 * Nothing but private bits left in the mask (as is the case after an
	 * EPOLLONESHOT delivery, see ep_send_events_proc()): the item is
	 * effectively disabled, so ignore the wakeup.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * If the waker told us which events occurred and none of them is
	 * one the item asked for, there is nothing to report.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * A ready-list scan is in progress (ovflist != EP_UNACTIVE_PTR):
	 * rdllist must not be touched, so chain the item on ovflist instead,
	 * unless it is chained there already.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
			if (epi->ws) {
				/*
				 * Activate the eventpoll-wide wakeup source
				 * rather than the item's own one.
				 */
				__pm_stay_awake(ep->ws);
			}

		}
		goto out_unlock;
	}

	/* Normal case: put the item on the ready list if not queued yet. */
	if (!ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up tasks sleeping in ep_poll(). For EPOLLEXCLUSIVE items
	 * (and no POLLFREE), ewake is set only when the wakeup key matches
	 * a direction the item is interested in, or carries neither
	 * POLLIN nor POLLOUT.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!((unsigned long)key & POLLFREE)) {
			switch ((unsigned long)key & EPOLLINOUT_BITS) {
			case POLLIN:
				if (epi->event.events & POLLIN)
					ewake = 1;
				break;
			case POLLOUT:
				if (epi->event.events & POLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up_locked(&ep->wq);
	}
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* The poll_wait wakeup is done after dropping ep->lock. */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	/* Non-exclusive items always report 1. */
	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if ((unsigned long)key & POLLFREE) {
		/*
		 * POLLFREE: unlink our entry from the wait queue right here.
		 * NOTE(review): presumably the queue is about to be freed by
		 * its owner — confirm against the POLLFREE senders.
		 */
		list_del_init(&wait->entry);

		/*
		 * Publish whead = NULL last. This pairs with the
		 * smp_load_acquire() in ep_remove_wait_queue(), which skips
		 * the queue once it sees NULL. Nothing may touch the entry
		 * after this store.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}
1246
1247
1248
1249
1250
1251static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1252 poll_table *pt)
1253{
1254 struct epitem *epi = ep_item_from_epqueue(pt);
1255 struct eppoll_entry *pwq;
1256
1257 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1258 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1259 pwq->whead = whead;
1260 pwq->base = epi;
1261 if (epi->event.events & EPOLLEXCLUSIVE)
1262 add_wait_queue_exclusive(whead, &pwq->wait);
1263 else
1264 add_wait_queue(whead, &pwq->wait);
1265 list_add_tail(&pwq->llink, &epi->pwqlist);
1266 epi->nwait++;
1267 } else {
1268
1269 epi->nwait = -1;
1270 }
1271}
1272
1273static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1274{
1275 int kcmp;
1276 struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1277 struct epitem *epic;
1278 bool leftmost = true;
1279
1280 while (*p) {
1281 parent = *p;
1282 epic = rb_entry(parent, struct epitem, rbn);
1283 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1284 if (kcmp > 0) {
1285 p = &parent->rb_right;
1286 leftmost = false;
1287 } else
1288 p = &parent->rb_left;
1289 }
1290 rb_link_node(&epi->rbn, parent, p);
1291 rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1292}
1293
1294
1295
/* Number of nesting depths for which path counts are kept. */
#define PATH_ARR_SIZE 5

/*
 * path_limits[d] is the maximum number of paths tolerated at nesting
 * depth d; path_count[d] is the running count for the check in progress.
 * Index 0 is never counted (see path_count_inc()).
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

/*
 * Account one more path at depth @nests. Depth 0 is unlimited and not
 * counted. Returns 0 while the depth's limit is respected, -1 once the
 * count exceeds it.
 */
static int path_count_inc(int nests)
{
	if (nests == 0)
		return 0;

	path_count[nests]++;

	return (path_count[nests] > path_limits[nests]) ? -1 : 0;
}

/* Reset all per-depth path counters to zero. */
static void path_count_init(void)
{
	int *slot;

	for (slot = path_count; slot < path_count + PATH_ARR_SIZE; slot++)
		*slot = 0;
}
1329
/*
 * ep_call_nested() callback for reverse_path_check(). @priv is the file
 * to examine and @call_nests the current nesting depth; @cookie is unused
 * here.
 *
 * For every epoll instance watching the file: if that epoll file is not
 * itself watched by anything, count one path at this depth; otherwise
 * recurse into it. Returns 0 on success, or a non-zero error (-1) as soon
 * as a path limit is exceeded or a nested call is refused.
 */
static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct file *child_file;
	struct epitem *epi;

	/* f_ep_links is traversed under RCU; items are removed with list_del_rcu(). */
	rcu_read_lock();
	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
		child_file = epi->ep->file;
		if (is_file_epoll(child_file)) {
			if (list_empty(&child_file->f_ep_links)) {
				/* Nobody watches this epoll file: end of a path. */
				if (path_count_inc(call_nests)) {
					error = -1;
					break;
				}
			} else {
				/* The epoll file is watched itself: follow it one level further. */
				error = ep_call_nested(&poll_loop_ncalls,
							EP_MAX_NESTS,
							reverse_path_check_proc,
							child_file, child_file,
							current);
			}
			if (error != 0)
				break;
		} else {
			/* Not expected: an eventpoll's file should be an epoll file. */
			printk(KERN_ERR "reverse_path_check_proc: "
				"file is not an ep!\n");
		}
	}
	rcu_read_unlock();
	return error;
}
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375static int reverse_path_check(void)
1376{
1377 int error = 0;
1378 struct file *current_file;
1379
1380
1381 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1382 path_count_init();
1383 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1384 reverse_path_check_proc, current_file,
1385 current_file, current);
1386 if (error)
1387 break;
1388 }
1389 return error;
1390}
1391
1392static int ep_create_wakeup_source(struct epitem *epi)
1393{
1394 const char *name;
1395 struct wakeup_source *ws;
1396
1397 if (!epi->ep->ws) {
1398 epi->ep->ws = wakeup_source_register("eventpoll");
1399 if (!epi->ep->ws)
1400 return -ENOMEM;
1401 }
1402
1403 name = epi->ffd.file->f_path.dentry->d_name.name;
1404 ws = wakeup_source_register(name);
1405
1406 if (!ws)
1407 return -ENOMEM;
1408 rcu_assign_pointer(epi->ws, ws);
1409
1410 return 0;
1411}
1412
1413
/*
 * Detach and unregister @epi's wakeup source. Called from ep_modify()
 * when EPOLLWAKEUP is cleared on an item that had a source.
 */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	/* Hide the source from new readers first. */
	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * Wait for RCU readers that may still hold the old pointer
	 * (ep_pm_stay_awake_rcu() dereferences it under rcu_read_lock())
	 * before the source is unregistered.
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}
1428
1429
1430
1431
/*
 * ep_insert - register a new item for (@tfile, @fd) with @ep.
 *
 * @ep:         eventpoll instance receiving the item.
 * @event:      requested event mask and user data (already in kernel memory).
 * @tfile:      target file to watch.
 * @fd:         fd number the target was named by.
 * @full_check: when non-zero, run reverse_path_check() after linking.
 *
 * Returns 0 on success; -ENOSPC when the user's watch limit is reached;
 * -ENOMEM on allocation failure (item, wakeup source or wait-queue hook);
 * -EINVAL when the reverse path check fails.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	/* Enforce the per-user watch limit before allocating anything. */
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialisation. */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Poll table whose queue proc hooks us onto the target's wait queues. */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Poll the target once. This both installs the wait-queue hooks
	 * (through ep_ptable_queue_proc()) and returns the events that are
	 * already pending. From here on ep_poll_callback() may run for the
	 * item.
	 */
	revents = ep_item_poll(epi, &epq.pt);

	/*
	 * ep_ptable_queue_proc() sets nwait to -1 when it could not attach
	 * a hook; treat that as an allocation failure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Link the item onto the target file's list of watchers. */
	spin_lock(&tfile->f_lock);
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/* Add the item to the eventpoll's rbtree. */
	ep_rbtree_insert(ep, epi);

	/* Now that the item is linked, verify the new topology if asked to. */
	error = -EINVAL;
	if (full_check && reverse_path_check())
		goto error_remove_epi;

	/* The ready list is manipulated under ep->lock with IRQs off. */
	spin_lock_irqsave(&ep->lock, flags);

	/* Record the NAPI id of the target for busy polling. */
	ep_set_busy_poll_napi_id(epi);

	/* If the target is already ready and the callback has not queued it, do so. */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiters that events are available. */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	/* Charge the watch to the user. */
	atomic_long_inc(&ep->user->epoll_watches);

	/* The poll_wait wakeup is done outside ep->lock. */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_remove_epi:
	/* Undo the f_ep_links and rbtree linkage. */
	spin_lock(&tfile->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&tfile->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * The callback may already have queued the item on the ready list
	 * between the poll above and the unregister; take it off again
	 * before the item is freed.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
	kmem_cache_free(epi_cache, epi);

	return error;
}
1558
1559
1560
1561
1562
/*
 * ep_modify - change the event mask and user data of an existing item.
 *
 * Installs the new mask/data from @event, adjusts the item's wakeup
 * source to match EPOLLWAKEUP, then re-polls the target and queues the
 * item if events matching the new mask are already pending.
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;
	poll_table pt;

	/* No queue proc: the re-poll below must not add new wait-queue hooks. */
	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new mask before polling the target, so that
	 * ep_poll_callback() already filters on the new events.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data;
	if (epi->event.events & EPOLLWAKEUP) {
		/* NOTE(review): the return value (possible -ENOMEM) is not checked here. */
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * Full barrier between the store of the new event mask above and the
	 * poll of the target below.
	 * NOTE(review): presumably pairs with the barrier implied by taking
	 * ep->lock in ep_poll_callback(), so that an event is seen either by
	 * the callback (with the new mask) or by the poll below — confirm.
	 */
	smp_mb();

	/* Fetch the events currently pending on the target, limited to the new mask. */
	revents = ep_item_poll(epi, &pt);

	/*
	 * If the target is already ready under the new mask, queue the item
	 * (unless it is queued already) and notify waiters.
	 */
	if (revents & event->events) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiters that events are available. */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* The poll_wait wakeup is done outside ep->lock. */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}
1636
/*
 * ep_send_events_proc - ep_scan_ready_list() callback that copies ready
 * events to user space.
 *
 * @ep:   eventpoll instance being scanned.
 * @head: private list of ready items handed over by the scan.
 * @priv: struct ep_send_events_data with the user buffer and its capacity.
 *
 * Returns the number of events delivered, or -EFAULT when the very first
 * copy to user space fails.
 */
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;
	struct wakeup_source *ws;
	poll_table pt;

	/* No queue proc: polling here must not add wait-queue hooks. */
	init_poll_funcptr(&pt, NULL);

	/*
	 * Consume items from the head of the list until it is empty or the
	 * user buffer is full. Items left on @head are spliced back onto the
	 * ready list by ep_scan_ready_list().
	 */
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		/*
		 * Hand the "stay awake" state over from the item's wakeup
		 * source to the eventpoll-wide one before relaxing the
		 * item's source. ep_scan_ready_list() relaxes ep->ws once
		 * the scan is finished.
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		revents = ep_item_poll(epi, &pt);

		/*
		 * Only report the item if the target still has events the
		 * item is interested in; otherwise it is silently dropped
		 * from the list.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				/* Copy failed: requeue the item at the front and stop. */
				list_add(&epi->rdllink, head);
				ep_pm_stay_awake(epi);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				/* One-shot: keep only the private bits, disabling the item. */
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * Neither one-shot nor edge-triggered: put
				 * the item back on the ready list so it is
				 * polled again by the next scan. rdllist is
				 * modified without ep->lock here; while the
				 * scan runs, ep_poll_callback() diverts to
				 * ovflist and does not touch rdllist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
				ep_pm_stay_awake(epi);
			}
		}
	}

	return eventcnt;
}
1716
1717static int ep_send_events(struct eventpoll *ep,
1718 struct epoll_event __user *events, int maxevents)
1719{
1720 struct ep_send_events_data esed;
1721
1722 esed.maxevents = maxevents;
1723 esed.events = events;
1724
1725 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
1726}
1727
1728static inline struct timespec64 ep_set_mstimeout(long ms)
1729{
1730 struct timespec64 now, ts = {
1731 .tv_sec = ms / MSEC_PER_SEC,
1732 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1733 };
1734
1735 ktime_get_ts64(&now);
1736 return timespec64_add_safe(now, ts);
1737}
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
/*
 * ep_poll - wait for events on @ep and copy them to user space.
 *
 * @ep:        eventpoll instance to wait on.
 * @events:    user buffer receiving the events.
 * @maxevents: capacity of @events.
 * @timeout:   milliseconds to wait; 0 means do not block. A negative
 *             value leaves the deadline pointer NULL (NOTE(review):
 *             presumably an unbounded wait — confirm against
 *             schedule_hrtimeout_range()).
 *
 * Returns the number of events delivered, 0 on timeout, -EINTR when
 * interrupted by a signal, or the error from ep_send_events().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		/* Turn the relative timeout into an absolute deadline plus slack. */
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Non-blocking call: skip the wait entirely. check_events
		 * expects ep->lock to be held, so take it before jumping.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:

	/* Nothing ready yet: give busy polling a chance first. */
	if (!ep_events_available(ep))
		ep_busy_loop(ep, timed_out);

	spin_lock_irqsave(&ep->lock, flags);

	if (!ep_events_available(ep)) {
		/*
		 * Busy polling produced nothing; forget the recorded NAPI id
		 * so that ep_poll_callback() can record a fresh one.
		 */
		ep_reset_busy_poll_napi_id(ep);

		/*
		 * No events available: queue ourselves as an exclusive waiter
		 * on ep->wq (we hold ep->lock, hence the __ variant).
		 * ep_poll_callback() wakes this queue when events arrive.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * Mark ourselves sleeping before re-checking the
			 * conditions, so a wakeup arriving in between is not
			 * lost.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/*
			 * Fatal signals are checked before looking for
			 * events, so a task being killed leaves the loop
			 * even when events are available.
			 */
			if (fatal_signal_pending(current)) {
				res = -EINTR;
				break;
			}
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Sleep without the spinlock; a zero return is recorded as a timeout. */
			spin_unlock_irqrestore(&ep->lock, flags);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}

		__remove_wait_queue(&ep->wq, &wait);
		__set_current_state(TASK_RUNNING);
	}
check_events:
	/* Reached with ep->lock held: sample availability, then unlock. */
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to deliver events. If availability was reported but the
	 * delivery produced nothing (res == 0) and time remains, go back
	 * and wait again.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1872{
1873 int error = 0;
1874 struct file *file = priv;
1875 struct eventpoll *ep = file->private_data;
1876 struct eventpoll *ep_tovisit;
1877 struct rb_node *rbp;
1878 struct epitem *epi;
1879
1880 mutex_lock_nested(&ep->mtx, call_nests + 1);
1881 ep->visited = 1;
1882 list_add(&ep->visited_list_link, &visited_list);
1883 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1884 epi = rb_entry(rbp, struct epitem, rbn);
1885 if (unlikely(is_file_epoll(epi->ffd.file))) {
1886 ep_tovisit = epi->ffd.file->private_data;
1887 if (ep_tovisit->visited)
1888 continue;
1889 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1890 ep_loop_check_proc, epi->ffd.file,
1891 ep_tovisit, current);
1892 if (error != 0)
1893 break;
1894 } else {
1895
1896
1897
1898
1899
1900
1901
1902
1903 if (list_empty(&epi->ffd.file->f_tfile_llink))
1904 list_add(&epi->ffd.file->f_tfile_llink,
1905 &tfile_check_list);
1906 }
1907 }
1908 mutex_unlock(&ep->mtx);
1909
1910 return error;
1911}
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924static int ep_loop_check(struct eventpoll *ep, struct file *file)
1925{
1926 int ret;
1927 struct eventpoll *ep_cur, *ep_next;
1928
1929 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1930 ep_loop_check_proc, file, ep, current);
1931
1932 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1933 visited_list_link) {
1934 ep_cur->visited = 0;
1935 list_del(&ep_cur->visited_list_link);
1936 }
1937 return ret;
1938}
1939
1940static void clear_tfile_check_list(void)
1941{
1942 struct file *file;
1943
1944
1945 while (!list_empty(&tfile_check_list)) {
1946 file = list_first_entry(&tfile_check_list, struct file,
1947 f_tfile_llink);
1948 list_del_init(&file->f_tfile_llink);
1949 }
1950 INIT_LIST_HEAD(&tfile_check_list);
1951}
1952
1953
1954
1955
1956SYSCALL_DEFINE1(epoll_create1, int, flags)
1957{
1958 int error, fd;
1959 struct eventpoll *ep = NULL;
1960 struct file *file;
1961
1962
1963 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1964
1965 if (flags & ~EPOLL_CLOEXEC)
1966 return -EINVAL;
1967
1968
1969
1970 error = ep_alloc(&ep);
1971 if (error < 0)
1972 return error;
1973
1974
1975
1976
1977 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1978 if (fd < 0) {
1979 error = fd;
1980 goto out_free_ep;
1981 }
1982 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1983 O_RDWR | (flags & O_CLOEXEC));
1984 if (IS_ERR(file)) {
1985 error = PTR_ERR(file);
1986 goto out_free_fd;
1987 }
1988 ep->file = file;
1989 fd_install(fd, file);
1990 return fd;
1991
1992out_free_fd:
1993 put_unused_fd(fd);
1994out_free_ep:
1995 ep_free(ep);
1996 return error;
1997}
1998
1999SYSCALL_DEFINE1(epoll_create, int, size)
2000{
2001 if (size <= 0)
2002 return -EINVAL;
2003
2004 return sys_epoll_create1(0);
2005}
2006
2007
2008
2009
2010
2011
2012SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
2013 struct epoll_event __user *, event)
2014{
2015 int error;
2016 int full_check = 0;
2017 struct fd f, tf;
2018 struct eventpoll *ep;
2019 struct epitem *epi;
2020 struct epoll_event epds;
2021 struct eventpoll *tep = NULL;
2022
2023 error = -EFAULT;
2024 if (ep_op_has_event(op) &&
2025 copy_from_user(&epds, event, sizeof(struct epoll_event)))
2026 goto error_return;
2027
2028 error = -EBADF;
2029 f = fdget(epfd);
2030 if (!f.file)
2031 goto error_return;
2032
2033
2034 tf = fdget(fd);
2035 if (!tf.file)
2036 goto error_fput;
2037
2038
2039 error = -EPERM;
2040 if (!tf.file->f_op->poll)
2041 goto error_tgt_fput;
2042
2043
2044 if (ep_op_has_event(op))
2045 ep_take_care_of_epollwakeup(&epds);
2046
2047
2048
2049
2050
2051
2052 error = -EINVAL;
2053 if (f.file == tf.file || !is_file_epoll(f.file))
2054 goto error_tgt_fput;
2055
2056
2057
2058
2059
2060
2061 if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
2062 if (op == EPOLL_CTL_MOD)
2063 goto error_tgt_fput;
2064 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
2065 (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
2066 goto error_tgt_fput;
2067 }
2068
2069
2070
2071
2072
2073 ep = f.file->private_data;
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090 mutex_lock_nested(&ep->mtx, 0);
2091 if (op == EPOLL_CTL_ADD) {
2092 if (!list_empty(&f.file->f_ep_links) ||
2093 is_file_epoll(tf.file)) {
2094 full_check = 1;
2095 mutex_unlock(&ep->mtx);
2096 mutex_lock(&epmutex);
2097 if (is_file_epoll(tf.file)) {
2098 error = -ELOOP;
2099 if (ep_loop_check(ep, tf.file) != 0) {
2100 clear_tfile_check_list();
2101 goto error_tgt_fput;
2102 }
2103 } else
2104 list_add(&tf.file->f_tfile_llink,
2105 &tfile_check_list);
2106 mutex_lock_nested(&ep->mtx, 0);
2107 if (is_file_epoll(tf.file)) {
2108 tep = tf.file->private_data;
2109 mutex_lock_nested(&tep->mtx, 1);
2110 }
2111 }
2112 }
2113
2114
2115
2116
2117
2118
2119 epi = ep_find(ep, tf.file, fd);
2120
2121 error = -EINVAL;
2122 switch (op) {
2123 case EPOLL_CTL_ADD:
2124 if (!epi) {
2125 epds.events |= POLLERR | POLLHUP;
2126 error = ep_insert(ep, &epds, tf.file, fd, full_check);
2127 } else
2128 error = -EEXIST;
2129 if (full_check)
2130 clear_tfile_check_list();
2131 break;
2132 case EPOLL_CTL_DEL:
2133 if (epi)
2134 error = ep_remove(ep, epi);
2135 else
2136 error = -ENOENT;
2137 break;
2138 case EPOLL_CTL_MOD:
2139 if (epi) {
2140 if (!(epi->event.events & EPOLLEXCLUSIVE)) {
2141 epds.events |= POLLERR | POLLHUP;
2142 error = ep_modify(ep, epi, &epds);
2143 }
2144 } else
2145 error = -ENOENT;
2146 break;
2147 }
2148 if (tep != NULL)
2149 mutex_unlock(&tep->mtx);
2150 mutex_unlock(&ep->mtx);
2151
2152error_tgt_fput:
2153 if (full_check)
2154 mutex_unlock(&epmutex);
2155
2156 fdput(tf);
2157error_fput:
2158 fdput(f);
2159error_return:
2160
2161 return error;
2162}
2163
2164
2165
2166
2167
2168SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
2169 int, maxevents, int, timeout)
2170{
2171 int error;
2172 struct fd f;
2173 struct eventpoll *ep;
2174
2175
2176 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
2177 return -EINVAL;
2178
2179
2180 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
2181 return -EFAULT;
2182
2183
2184 f = fdget(epfd);
2185 if (!f.file)
2186 return -EBADF;
2187
2188
2189
2190
2191
2192 error = -EINVAL;
2193 if (!is_file_epoll(f.file))
2194 goto error_fput;
2195
2196
2197
2198
2199
2200 ep = f.file->private_data;
2201
2202
2203 error = ep_poll(ep, events, maxevents, timeout);
2204
2205error_fput:
2206 fdput(f);
2207 return error;
2208}
2209
2210
2211
2212
2213
2214SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
2215 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
2216 size_t, sigsetsize)
2217{
2218 int error;
2219 sigset_t ksigmask, sigsaved;
2220
2221
2222
2223
2224
2225 if (sigmask) {
2226 if (sigsetsize != sizeof(sigset_t))
2227 return -EINVAL;
2228 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
2229 return -EFAULT;
2230 sigsaved = current->blocked;
2231 set_current_blocked(&ksigmask);
2232 }
2233
2234 error = sys_epoll_wait(epfd, events, maxevents, timeout);
2235
2236
2237
2238
2239
2240
2241
2242 if (sigmask) {
2243 if (error == -EINTR) {
2244 memcpy(¤t->saved_sigmask, &sigsaved,
2245 sizeof(sigsaved));
2246 set_restore_sigmask();
2247 } else
2248 set_current_blocked(&sigsaved);
2249 }
2250
2251 return error;
2252}
2253
2254#ifdef CONFIG_COMPAT
2255COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2256 struct epoll_event __user *, events,
2257 int, maxevents, int, timeout,
2258 const compat_sigset_t __user *, sigmask,
2259 compat_size_t, sigsetsize)
2260{
2261 long err;
2262 compat_sigset_t csigmask;
2263 sigset_t ksigmask, sigsaved;
2264
2265
2266
2267
2268
2269 if (sigmask) {
2270 if (sigsetsize != sizeof(compat_sigset_t))
2271 return -EINVAL;
2272 if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
2273 return -EFAULT;
2274 sigset_from_compat(&ksigmask, &csigmask);
2275 sigsaved = current->blocked;
2276 set_current_blocked(&ksigmask);
2277 }
2278
2279 err = sys_epoll_wait(epfd, events, maxevents, timeout);
2280
2281
2282
2283
2284
2285
2286
2287 if (sigmask) {
2288 if (err == -EINTR) {
2289 memcpy(¤t->saved_sigmask, &sigsaved,
2290 sizeof(sigsaved));
2291 set_restore_sigmask();
2292 } else
2293 set_current_blocked(&sigsaved);
2294 }
2295
2296 return err;
2297}
2298#endif
2299
2300static int __init eventpoll_init(void)
2301{
2302 struct sysinfo si;
2303
2304 si_meminfo(&si);
2305
2306
2307
2308 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2309 EP_ITEM_COST;
2310 BUG_ON(max_user_watches < 0);
2311
2312
2313
2314
2315
2316 ep_nested_calls_init(&poll_loop_ncalls);
2317
2318
2319 ep_nested_calls_init(&poll_safewake_ncalls);
2320
2321
2322 ep_nested_calls_init(&poll_readywalk_ncalls);
2323
2324
2325
2326
2327
2328 BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
2329
2330
2331 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
2332 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
2333
2334
2335 pwq_cache = kmem_cache_create("eventpoll_pwq",
2336 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
2337
2338 return 0;
2339}
2340fs_initcall(eventpoll_init);
2341