/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009	 Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>

/*
 * LOCKING:
 * There are three level of locking required by epoll :
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * Then we also need a global mutex to serialize eventpoll_release_file()
 * and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is also acquired when inserting an epoll fd onto another epoll
 * fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	/* RB tree node used to link this structure to the eventpoll RB tree */
	struct rb_node rbn;

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/* Protect the access to this structure */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	struct rb_root rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	int visited;
	struct list_head visited_list_link;
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watches (per user) */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

/* Used to call file's f_op->poll() under the nested calls boundaries */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* Visited nodes during ep_loop_check(), so that we can unset them when done */
static LIST_HEAD(visited_list);

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long zero;
static long long_max = LONG_MAX;

struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */
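
/*
 * The knob above is exported as /proc/sys/fs/epoll/max_user_watches
 * (sysctl fs.epoll.max_user_watches). Once a user's watch count reaches
 * the limit, further EPOLL_CTL_ADD requests fail in ep_insert() with
 * -ENOSPC.
 */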

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
		(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct list_head *p)
{
	return !list_empty(p);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
	return op != EPOLL_CTL_DEL;
}

/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}
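
/*
 * Example of the nesting the nested_calls machinery guards against: with
 *
 *	efd1 = epoll_create();
 *	efd2 = epoll_create();
 *	epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * a wakeup on efd1 re-enters the wakeup code in order to notify efd2.
 * The bookkeeping above bounds such chains to EP_MAX_NESTS levels and
 * rejects a cookie already present for the same context (a cycle).
 */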

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Returns: Returns a value different than zero if ready events are available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}
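
/*
 * Note that "ep->ovflist != EP_UNACTIVE_PTR" also counts as "events
 * available": it means an event transfer is in progress and the poll
 * callback is parking freshly signaled items on ->ovflist instead of
 * ->rdllist, so an empty ready list alone is not conclusive.
 */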

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (by the meaning of same cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @max_nests: Maximum number of allowed nesting calls.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
			/*
			 * Ops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}

/*
 * The use of wait queues by epoll is done in a very controlled manner.
 * Wake ups can nest inside each other, but are never done with the same
 * locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked the recursion
 * constraints: no more than EP_MAX_NESTS levels, to avoid stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll by annotating each nesting level with a
 * different wait queue lock subclass.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long flags;

	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, flags);
}
#else
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	wake_up_poll(wqueue, events);
}
#endif

static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
{
	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
			  1 + call_nests);
	return 0;
}

/*
 * Perform a safe wake up of the poll wait list. The problem is that
 * with the new callback'd wake up system, it is possible that the
 * poll callback is reentered from inside the call to wake_up() done
 * on the poll wait queue head. The rule is that we cannot reenter the
 * wake up code from the same task more than EP_MAX_NESTS times,
 * and we cannot reenter the same wait queue head at all. This will
 * enable to have a hierarchy of epoll file descriptors of no more than
 * EP_MAX_NESTS deep.
 */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
	int this_cpu = get_cpu();

	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);

	put_cpu();
}

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/* If it is cleared by POLLFREE, it should be rcu-safe */
	whead = rcu_dereference(pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
 * ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while (!list_empty(lsthead)) {
		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

		list_del(&pwq->llink);
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}

/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (ws)
		__pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
	return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}
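
/*
 * Of the two stay-awake helpers above, ep_pm_stay_awake() relies on
 * ep->mtx to keep epi->ws stable, while ep_pm_stay_awake_rcu() is the
 * variant for ep_poll_callback(), which runs from wakeup context where
 * ep->mtx cannot be taken and an RCU read-side section protects the
 * pointer instead.
 */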

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv,
			      int depth)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */
	mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback queue items directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do linear
	 * scans of the list.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Now call the callback function.
	 */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events from inside the poll callback will be
	 * queued into ep->rdllist again.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
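
/*
 * The ovflist dance above in short: while @sproc runs on the detached
 * "txlist", ep_poll_callback() chains freshly signaled items on
 * ->ovflist instead of ->rdllist. A possible interleaving:
 *
 *	CPU0 (ep_scan_ready_list)	CPU1 (ep_poll_callback)
 *	splice rdllist -> txlist
 *	ovflist = NULL
 *	sproc(txlist) ...		event fires, epi -> ovflist
 *	drain ovflist -> rdllist
 *	ovflist = EP_UNACTIVE_PTR
 *
 * so no wakeup is lost even though ->lock is dropped while @sproc runs.
 */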

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Removes poll wait queue hooks. We _have_ to do this without holding
	 * the "ep->lock" otherwise a deadlock might occur. This because of the
	 * sequence of the steps of the poll callback, which ends up taking
	 * "ep->lock" while already nested under the wait queue head lock.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	if (ep_is_linked(&epi->fllink))
		list_del_init(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	wakeup_source_unregister(ep_wakeup_source(epi));

	/* At this point it is safe to free the eventpoll item */
	kmem_cache_free(epi_cache, epi);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}

static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting for these file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * The epoll file descriptor is closing, so no one else holds a
	 * reference to it, and the only code that could still reach these
	 * items is the file cleanup path, which is serialized by "epmutex".
	 */
	mutex_lock(&epmutex);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
	}

	/*
	 * Walks through the whole tree by freeing each "struct epitem". At
	 * this point we are sure no poll callbacks will be lingering around,
	 * and by holding "epmutex" we can be sure that no file cleanup code
	 * will hit us during this operation. Taking "ep->mtx" is not strictly
	 * needed here, but it keeps the lockdep checks on epi->ws performed
	 * inside ep_remove() happy.
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_free(ep);

	return 0;
}

static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
{
	pt->_key = epi->event.events;

	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
}

static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *epi, *tmp;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (ep_item_poll(epi, &pt))
			return POLLIN | POLLRDNORM;
		else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as
			 * caller requested events goes. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}

static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
}

static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	int pollflags;
	struct eventpoll *ep = file->private_data;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list. This needs to be done under ep_call_nested()
	 * supervision, since the call to f_op->poll() done on listed files
	 * could re-enter here.
	 */
	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
				   ep_poll_readyevents_proc, ep, ep, current);

	return pollflags != -1 ? pollflags : 0;
}

#ifdef CONFIG_PROC_FS
static int ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;
	int ret = 0;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);

		ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
				 epi->ffd.fd, epi->event.events,
				 (long long)epi->event.data);
		if (ret)
			break;
	}
	mutex_unlock(&ep->mtx);

	return ret;
}
#endif

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that
 * are closed without being removed from the epoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct list_head *lsthead = &file->f_ep_links;
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * We don't want to get "file->f_lock" because it is not
	 * necessary. It is not necessary because we're in the "struct file"
	 * cleanup path, and this means that no one is using this file anymore.
	 * So, for example, epoll_ctl() cannot hit us here, because its callers
	 * are required to hold a reference to the file.
	 *
	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
	 */
	mutex_lock(&epmutex);

	while (!list_empty(lsthead)) {
		epi = list_first_entry(lsthead, struct epitem, fllink);

		ep = epi->ep;
		list_del_init(&epi->fllink);
		mutex_lock_nested(&ep->mtx, 0);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}

	mutex_unlock(&epmutex);
}

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}
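
/*
 * The tree is keyed by ep_cmp_ffd(): items are ordered first by the
 * "struct file *" pointer value and then by fd, so the same file added
 * under two different descriptors yields two distinct epitems.
 */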

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	if ((unsigned long)key & POLLFREE) {
		ep_pwq_from_wait(wait)->whead = NULL;
		/*
		 * whead = NULL above can race with ep_remove_wait_queue()
		 * which can do another remove_wait_queue() after us, so we
		 * can't use __remove_wait_queue(). whead->lock is held by
		 * the caller.
		 */
		list_del_init(&wait->task_list);
	}

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
			if (epi->ws) {
				/*
				 * Activate ep->ws since epi->ws may get
				 * deactivated at any time.
				 */
				__pm_stay_awake(ep->ws);
			}

		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}
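
/*
 * Putting it together, the delivery path for a single event is:
 *
 *	wake_up(whead)				(driver signals readiness)
 *	  -> ep_poll_callback()			(the function above)
 *	       link epi on rdllist/ovflist
 *	       wake_up_locked(&ep->wq)		(wakes epoll_wait() sleepers)
 *	       ep_poll_safewake(&ep->poll_wait)	(wakes nested epolls)
 */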

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
	struct epitem *epic;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color(&epi->rbn, &ep->rbr);
}
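
/*
 * Note there is no "equal keys" case above: ep_insert() only calls this
 * after ep_find() returned NULL under the same ep->mtx, so a duplicate
 * (file, fd) pair cannot appear in the tree.
 */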

#define PATH_ARR_SIZE 5
/*
 * These are the number paths of length 1 to 5, that we are allowing to emanate
 * from a single file of interest. For example, we allow 1000 paths of length
 * 1, to emanate from each file of interest. This essentially represents the
 * potential wakeup paths, which need to be limited in order to avoid massive
 * uncontrolled wakeup storms. The common case should be a single ep which
 * is connected to n file sources. In this case each file source has 1 path
 * of length 1. Thus, the numbers below should be more than sufficient. These
 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epmutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

static int path_count_inc(int nests)
{
	/* Allow an arbitrary number of depth-0 paths */
	if (nests == 0)
		return 0;

	if (++path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

static void path_count_init(void)
{
	int i;

	for (i = 0; i < PATH_ARR_SIZE; i++)
		path_count[i] = 0;
}

static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct file *child_file;
	struct epitem *epi;

	list_for_each_entry(epi, &file->f_ep_links, fllink) {
		child_file = epi->ep->file;
		if (is_file_epoll(child_file)) {
			if (list_empty(&child_file->f_ep_links)) {
				if (path_count_inc(call_nests)) {
					error = -1;
					break;
				}
			} else {
				error = ep_call_nested(&poll_loop_ncalls,
							EP_MAX_NESTS,
							reverse_path_check_proc,
							child_file, child_file,
							current);
			}
			if (error != 0)
				break;
		} else {
			printk(KERN_ERR "reverse_path_check_proc: "
				"file is not an ep!\n");
		}
	}
	return error;
}

/**
 * reverse_path_check - The tfile_check_list is list of file *, which have
 *                      links that are proposed to be newly added. We need to
 *                      make sure that those added links don't add too many
 *                      paths such that we will spend all our time waking up
 *                      eventpoll objects.
 *
 * Returns: Returns zero if the proposed links don't create too many paths,
 *	    -1 otherwise.
 */
static int reverse_path_check(void)
{
	int error = 0;
	struct file *current_file;

	/* let's call this for all tfiles */
	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
		path_count_init();
		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
					reverse_path_check_proc, current_file,
					current_file, current);
		if (error)
			break;
	}
	return error;
}

static int ep_create_wakeup_source(struct epitem *epi)
{
	const char *name;
	struct wakeup_source *ws;

	if (!epi->ep->ws) {
		epi->ep->ws = wakeup_source_register("eventpoll");
		if (!epi->ep->ws)
			return -ENOMEM;
	}

	name = epi->ffd.file->f_path.dentry->d_name.name;
	ws = wakeup_source_register(name);

	if (!ws)
		return -ENOMEM;
	rcu_assign_pointer(epi->ws, ws);

	return 0;
}

/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
	 * used internally by wakeup_source_remove, too (called by
	 * wakeup_source_unregister), so we cannot use call_rcu
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}

/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Otherwise an error inside the wait queue hook
	 * installation would go undetected.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hooks for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* now check if we've created too many backpaths */
	error = -EINVAL;
	if (reverse_path_check())
		goto error_remove_epi;

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_remove_epi:
	spin_lock(&tfile->f_lock);
	if (ep_is_linked(&epi->fllink))
		list_del_init(&epi->fllink);
	spin_unlock(&tfile->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
	kmem_cache_free(epi_cache, epi);

	return error;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * does not have it enabled.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data; /* protected by mtx */
	if (epi->event.events & EPOLLWAKEUP) {
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs.  This ensures
	 *    we do not miss events from ep_poll_callback if an
	 *    event occurs immediately after we call f_op->poll().
	 *    We need this because we did not take ep->lock while
	 *    changing epi above (but ep_poll_callback does take
	 *    ep->lock).
	 *
	 * 2) We also need to ensure we do not miss _past_ events
	 *    when calling f_op->poll().  This barrier also
	 *    pairs with the barrier in wq_has_sleeper (see
	 *    comments for wq_has_sleeper).
	 *
	 * This barrier will now guarantee ep_poll_callback or f_op->poll
	 * (or both) will notice the readiness of an item.
	 */
	smp_mb();

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 */
	revents = ep_item_poll(epi, &pt);

	/*
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (revents & event->events) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}

static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;
	struct wakeup_source *ws;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate epi->ws
		 * below).
		 *
		 * This could be rearranged to delay the deactivation of epi->ws
		 * instead, but then epi->ws would temporarily be out of sync
		 * with ep_is_linked().
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		revents = ep_item_poll(epi, &pt);

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				list_add(&epi->rdllink, head);
				ep_pm_stay_awake(epi);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, no one can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
				ep_pm_stay_awake(epi);
			}
		}
	}

	return eventcnt;
}

static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events;

	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
}
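
/*
 * Call graph for event delivery, for reference:
 *
 *	epoll_wait(2)
 *	  -> ep_poll()
 *	       -> ep_send_events()
 *	            -> ep_scan_ready_list(ep, ep_send_events_proc, ...)
 */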

static inline struct timespec ep_set_mstimeout(long ms)
{
	struct timespec now, ts = {
		.tv_sec = ms / MSEC_PER_SEC,
		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
	};

	ktime_get_ts(&now);
	return timespec_add_safe(now, ts);
}
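
/*
 * E.g. ms = 2500 yields ts = { .tv_sec = 2, .tv_nsec = 500000000 }
 * added to the current time, i.e. an absolute expiry 2.5s from now.
 * timespec_add_safe() saturates instead of overflowing for huge values.
 */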

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	long slack = 0;
	wait_queue_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:
	spin_lock_irqsave(&ep->lock, flags);

	if (!ep_events_available(ep)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			if (!freezable_schedule_hrtimeout_range(to, slack,
								HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

/**
 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
 *                      API, to verify that adding an epoll file inside another
 *                      epoll structure, does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @priv: Pointer to the epoll file to be currently checked.
 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
 *          file.
 * @call_nests: Current depth of the @ep_call_nested() call stack.
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct eventpoll *ep = file->private_data;
	struct eventpoll *ep_tovisit;
	struct rb_node *rbp;
	struct epitem *epi;

	mutex_lock_nested(&ep->mtx, call_nests + 1);
	ep->visited = 1;
	list_add(&ep->visited_list_link, &visited_list);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			ep_tovisit = epi->ffd.file->private_data;
			if (ep_tovisit->visited)
				continue;
			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
					ep_loop_check_proc, epi->ffd.file,
					ep_tovisit, current);
			if (error != 0)
				break;
		} else {
			/*
			 * If we've reached a file that is not associated with
			 * an ep, then we need to check if the newly added
			 * links are going to add too many wakeup paths. We do
			 * this by adding it to the tfile_check_list, if it's
			 * not already there, and calling reverse_path_check()
			 * during ep_insert().
			 */
			if (list_empty(&epi->ffd.file->f_tfile_llink))
				list_add(&epi->ffd.file->f_tfile_llink,
					 &tfile_check_list);
		}
	}
	mutex_unlock(&ep->mtx);

	return error;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
 *                 inside another epoll file (represented by @ep) does not
 *                 create closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll private data structure.
 * @file: Pointer to the epoll file to be checked.
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
	int ret;
	struct eventpoll *ep_cur, *ep_next;

	ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
			      ep_loop_check_proc, file, ep, current);
	/* clear visited list */
	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
							visited_list_link) {
		ep_cur->visited = 0;
		list_del(&ep_cur->visited_list_link);
	}
	return ret;
}
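
/*
 * E.g. after:
 *
 *	epoll_ctl(efd1, EPOLL_CTL_ADD, efd2, &ev);
 *
 * a subsequent epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, &ev) makes this
 * check fail, and epoll_ctl() returns -ELOOP to userspace.
 */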

static void clear_tfile_check_list(void)
{
	struct file *file;

	/* first clear the tfile_check_list */
	while (!list_empty(&tfile_check_list)) {
		file = list_first_entry(&tfile_check_list, struct file,
					f_tfile_llink);
		list_del_init(&file->f_tfile_llink);
	}
	INIT_LIST_HEAD(&tfile_check_list);
}

/*
 * Open an eventpoll file descriptor.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	/* The size argument is only checked for sanity; it is otherwise ignored */
	return sys_epoll_create1(0);
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
		epds.events &= ~EPOLLWAKEUP;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We hold "epmutex" across both ADD and DEL operations, so that the
	 * loop and path checks, and the file lists they rely on, are
	 * serialized against concurrent inserts and removes.
	 */
	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
	}
	if (op == EPOLL_CTL_ADD) {
		if (is_file_epoll(tfile)) {
			error = -ELOOP;
			if (ep_loop_check(ep, tfile) != 0) {
				clear_tfile_check_list();
				goto error_tgt_fput;
			}
		} else
			list_add(&tfile->f_tfile_llink, &tfile_check_list);
	}

	mutex_lock_nested(&ep->mtx, 0);

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tfile, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (did_lock_epmutex)
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct fd" */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fdput(f);
	return error;
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	int error;
	sigset_t ksigmask, sigsaved;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	if (sigmask) {
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;
		sigsaved = current->blocked;
		set_current_blocked(&ksigmask);
	}

	error = sys_epoll_wait(epfd, events, maxevents, timeout);

	/*
	 * If we changed the signal mask, we need to restore the original one.
	 * In case we've got a signal while waiting, we do not restore the
	 * signal mask yet, and we allow do_signal() to deliver the signal on
	 * the way back to userspace, before the signal mask is restored.
	 */
	if (sigmask) {
		if (error == -EINTR) {
			memcpy(&current->saved_sigmask, &sigsaved,
			       sizeof(sigsaved));
			set_restore_sigmask();
		} else
			set_current_blocked(&sigsaved);
	}

	return error;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
			struct epoll_event __user *, events,
			int, maxevents, int, timeout,
			const compat_sigset_t __user *, sigmask,
			compat_size_t, sigsetsize)
{
	long err;
	compat_sigset_t csigmask;
	sigset_t ksigmask, sigsaved;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
			return -EFAULT;
		sigset_from_compat(&ksigmask, &csigmask);
		sigsaved = current->blocked;
		set_current_blocked(&ksigmask);
	}

	err = sys_epoll_wait(epfd, events, maxevents, timeout);

	/*
	 * If we changed the signal mask, we need to restore the original one.
	 * In case we've got a signal while waiting, we do not restore the
	 * signal mask yet, and we allow do_signal() to deliver the signal on
	 * the way back to userspace, before the signal mask is restored.
	 */
	if (sigmask) {
		if (err == -EINTR) {
			memcpy(&current->saved_sigmask, &sigsaved,
			       sizeof(sigsaved));
			set_restore_sigmask();
		} else
			set_current_blocked(&sigsaved);
	}

	return err;
}
#endif

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_nested_calls_init(&poll_safewake_ncalls);

	/* Initialize the structure used to perform file's f_op->poll() calls */
	ep_nested_calls_init(&poll_readywalk_ncalls);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

	return 0;
}
fs_initcall(eventpoll_init);