1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/poll.h>
24#include <linux/string.h>
25#include <linux/list.h>
26#include <linux/hash.h>
27#include <linux/spinlock.h>
28#include <linux/syscalls.h>
29#include <linux/rbtree.h>
30#include <linux/wait.h>
31#include <linux/eventpoll.h>
32#include <linux/mount.h>
33#include <linux/bitops.h>
34#include <linux/mutex.h>
35#include <linux/anon_inodes.h>
36#include <asm/uaccess.h>
37#include <asm/system.h>
38#include <asm/io.h>
39#include <asm/mman.h>
40#include <asm/atomic.h>
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/* Epoll-private bits carried inside the user-supplied event mask */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

/* Upper bound for the "maxevents" argument of epoll_wait(2) */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/* Marker for an epitem not currently linked into the overflow list */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Per-watch memory cost, used to size the default max_user_watches */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
92
/* (file, fd) pair used as the search key for the eventpoll RB tree */
struct epoll_filefd {
	struct file *file;
	int fd;
};
97
98
99
100
101
/*
 * Structure used to track possible nested calls, by remembering
 * its cookie (what identifies the call) and the caller context.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};
107
108
109
110
111
/*
 * This structure is used as collector for nested calls, to check for
 * maximum recursion depth and loops.
 */
struct nested_calls {
	struct list_head tasks_call_list;	/* list of nested_call_node */
	spinlock_t lock;			/* protects the list above */
};
116
117
118
119
120
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 */
struct epitem {
	/* RB tree node used to link this structure to the eventpoll RB tree */
	struct rb_node rbn;

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items; EP_UNACTIVE_PTR when not chained.
	 */
	struct epitem *next;

	/* The (file, fd) pair that refers to the target file descriptor */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations (-1 on error) */
	int nwait;

	/* List containing the poll wait queues (struct eppoll_entry) */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};
152
153
154
155
156
157
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/* Protects rdllist, ovflist and the wait queues below */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. It is held across event transfer
	 * loops, eventpoll_release_file() and epoll_ctl() operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	struct rb_root rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem"
	 * that happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;
};
192
193
/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head; ep_poll_callback() is its wakeup function.
	 */
	wait_queue_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};
210
211
/* Wrapper struct used by poll queueing: maps a poll_table back to its epitem */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
216
217
/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
};
222
223
224
225
226
/*
 * Configuration options available inside /proc/sys/fs/epoll/.
 * Maximum number of epoll watches a single user can register.
 */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file(),
 * and the epoll-onto-epoll loop check done at EPOLL_CTL_ADD time.
 */
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

/* Used to call file's f_op->poll() under the nested calls boundaries */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
248
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Bounds for the max_user_watches sysctl (proc_doulongvec_minmax) */
static long zero;
static long long_max = LONG_MAX;

/* sysctl table exported as /proc/sys/fs/epoll/ */
ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif
269
270
271
272static inline void ep_set_ffd(struct epoll_filefd *ffd,
273 struct file *file, int fd)
274{
275 ffd->file = file;
276 ffd->fd = fd;
277}
278
279
280static inline int ep_cmp_ffd(struct epoll_filefd *p1,
281 struct epoll_filefd *p2)
282{
283 return (p1->file > p2->file ? +1:
284 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
285}
286
287
/* Tells us if the item is currently linked into a list */
static inline int ep_is_linked(struct list_head *p)
{
	return list_empty(p) ? 0 : 1;
}
292
293
294static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
295{
296 return container_of(p, struct eppoll_entry, wait)->base;
297}
298
299
300static inline struct epitem *ep_item_from_epqueue(poll_table *p)
301{
302 return container_of(p, struct ep_pqueue, pt)->epi;
303}
304
305
306static inline int ep_op_has_event(int op)
307{
308 return op != EPOLL_CTL_DEL;
309}
310
311
312static void ep_nested_calls_init(struct nested_calls *ncalls)
313{
314 INIT_LIST_HEAD(&ncalls->tasks_call_list);
315 spin_lock_init(&ncalls->lock);
316}
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (by the meaning of same cookie)
 *                  is not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @max_nests: Maximum number of allowed nesting calls.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
			/*
			 * Ops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function (unlocked, so it may itself nest) */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
381
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Lockdep-aware variant: takes the wait queue lock with an explicit
 * subclass so that nested epoll wakeups do not trigger false lock
 * inversion reports.
 */
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long flags;

	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, flags);
}
#else
/* Without lockdep the subclass is irrelevant; plain wake up */
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	wake_up_poll(wqueue, events);
}
#endif
399
400static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
401{
402 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
403 1 + call_nests);
404 return 0;
405}
406
407
408
409
410
411
412
413
414
415
416
417static void ep_poll_safewake(wait_queue_head_t *wq)
418{
419 int this_cpu = get_cpu();
420
421 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
422 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
423
424 put_cpu();
425}
426
427
428
429
430
431
432static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
433{
434 struct list_head *lsthead = &epi->pwqlist;
435 struct eppoll_entry *pwq;
436
437 while (!list_empty(lsthead)) {
438 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
439
440 list_del(&pwq->llink);
441 remove_wait_queue(pwq->whead, &pwq->wait);
442 kmem_cache_free(pwq_cache, pwq);
443 }
444}
445
446
447
448
449
450
451
452
453
454
455
456
/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
 *                      for the scan callback to call f_op->poll(). Also
 *                      allows for O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */
	mutex_lock(&ep->mtx);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks below are not lost; they
	 * are queued on ->ovflist by ep_poll_callback() instead.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Now call the callback function, outside of ep->lock.
	 */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here, resetting
	 * each item's ->next back to EP_UNACTIVE_PTR.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(&epi->rdllink))
			list_add_tail(&epi->rdllink, &ep->rdllist);
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
540
541
542
543
544
/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Removes poll wait queue hooks. We _have_ to do this without holding
	 * the "ep->lock" otherwise a deadlock might occur. This because of the
	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
	 * queue head lock when unregistering the wait queue. The wakeup callback
	 * will run by holding the wait queue head lock and will call our callback
	 * that will try to get "ep->lock".
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	if (ep_is_linked(&epi->fllink))
		list_del_init(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	/* At this point it is safe to free the eventpoll item */
	kmem_cache_free(epi_cache, epi);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}
580
/* Releases all resources owned by an eventpoll descriptor being closed */
static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting on this file's poll queue */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->mtx" here because the epoll file
	 * is on the way to be removed and no one has references to it
	 * anymore. Holding "epmutex" is sufficient here.
	 */
	mutex_lock(&epmutex);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
	}

	/*
	 * Walks through the whole tree by freeing each "struct epitem". At this
	 * point we are sure no poll callbacks will be lingering around, and also by
	 * holding "epmutex" we can be sure that no file cleanup code will hit
	 * us during this operation. So we can avoid the lock on "ep->lock".
	 */
	while ((rbp = rb_first(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
	}

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	kfree(ep);
}
625
626static int ep_eventpoll_release(struct inode *inode, struct file *file)
627{
628 struct eventpoll *ep = file->private_data;
629
630 if (ep)
631 ep_free(ep);
632
633 return 0;
634}
635
636static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
637 void *priv)
638{
639 struct epitem *epi, *tmp;
640
641 list_for_each_entry_safe(epi, tmp, head, rdllink) {
642 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
643 epi->event.events)
644 return POLLIN | POLLRDNORM;
645 else {
646
647
648
649
650
651 list_del_init(&epi->rdllink);
652 }
653 }
654
655 return 0;
656}
657
658static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
659{
660 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
661}
662
/* Poll hook for the eventpoll file itself (epoll-inside-epoll support) */
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	int pollflags;
	struct eventpoll *ep = file->private_data;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list. This need to be done under ep_call_nested()
	 * supervision, since the call to f_op->poll() done on listed files
	 * could re-enter here.
	 */
	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
				   ep_poll_readyevents_proc, ep, ep, current);

	/* ep_call_nested() returns -1 when the nesting limit is hit */
	return pollflags != -1 ? pollflags : 0;
}
682
683
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};
689
690
691static inline int is_file_epoll(struct file *f)
692{
693 return f->f_op == &eventpoll_fops;
694}
695
696
697
698
699
700
/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that
 * are closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct list_head *lsthead = &file->f_ep_links;
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * We don't want to get "file->f_lock" because it is not
	 * necessary. It is not necessary because we're in the "struct file"
	 * cleanup path, and this means that no one is using this file anymore.
	 * So, for example, epoll_ctl() cannot hit here since if we reach this
	 * point, the file counter already went to zero and fget() would fail.
	 * The only hit might come from ep_free() but by holding the mutex
	 * will correctly serialize the operation. We do need to acquire
	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
	 * from anywhere but ep_free().
	 */
	mutex_lock(&epmutex);

	while (!list_empty(lsthead)) {
		epi = list_first_entry(lsthead, struct epitem, fllink);

		ep = epi->ep;
		list_del_init(&epi->fllink);
		mutex_lock(&ep->mtx);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}

	mutex_unlock(&epmutex);
}
734
/*
 * Allocates and initializes a new "struct eventpoll", charging it to the
 * current user. Returns 0 and stores the result in *pep, or -ENOMEM.
 */
static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	/* ovflist starts "inactive"; only used during ready-list scans */
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}
764
765
766
767
768
769
770static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
771{
772 int kcmp;
773 struct rb_node *rbp;
774 struct epitem *epi, *epir = NULL;
775 struct epoll_filefd ffd;
776
777 ep_set_ffd(&ffd, file, fd);
778 for (rbp = ep->rbr.rb_node; rbp; ) {
779 epi = rb_entry(rbp, struct epitem, rbn);
780 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
781 if (kcmp > 0)
782 rbp = rbp->rb_right;
783 else if (kcmp < 0)
784 rbp = rbp->rb_left;
785 else {
786 epir = epi;
787 break;
788 }
789 }
790
791 return epir;
792}
793
794
795
796
797
798
/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}
862
863
864
865
866
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists. Invoked by f_op->poll() through the
 * poll_table set up in ep_insert().
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}
885
886static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
887{
888 int kcmp;
889 struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
890 struct epitem *epic;
891
892 while (*p) {
893 parent = *p;
894 epic = rb_entry(parent, struct epitem, rbn);
895 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
896 if (kcmp > 0)
897 p = &parent->rb_right;
898 else
899 p = &parent->rb_left;
900 }
901 rb_link_node(&epi->rbn, parent, p);
902 rb_insert_color(&epi->rbn, &ep->rbr);
903}
904
905
906
907
/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}
1008
1009
1010
1011
1012
/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 * NOTE(review): there is no explicit memory barrier between the
	 * mask update and the poll below — assumed acceptable on this
	 * kernel version; confirm against ep_poll_callback() ordering.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data;

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 */
	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

	/*
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (revents & event->events) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}
1056
/*
 * Scan callback that copies ready events to the userspace buffer described
 * by the ep_send_events_data passed as priv. Returns the number of events
 * copied, or -EFAULT if nothing was copied and the user buffer faulted.
 */
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		list_del_init(&epi->rdllink);

		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
			epi->event.events;

		/*
		 * If the event mask intersect the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				/* put it back so it is not lost */
				list_add(&epi->rdllink, head);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, no one can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
			}
		}
	}

	return eventcnt;
}
1115
1116static int ep_send_events(struct eventpoll *ep,
1117 struct epoll_event __user *events, int maxevents)
1118{
1119 struct ep_send_events_data esed;
1120
1121 esed.maxevents = maxevents;
1122 esed.events = events;
1123
1124 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1125}
1126
1127static inline struct timespec ep_set_mstimeout(long ms)
1128{
1129 struct timespec now, ts = {
1130 .tv_sec = ms / MSEC_PER_SEC,
1131 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1132 };
1133
1134 ktime_get_ts(&now);
1135 return timespec_add_safe(now, ts);
1136}
1137
/*
 * Core of epoll_wait(2): waits for up to "timeout" milliseconds (negative
 * means wait indefinitely, zero means don't block) for events, then copies
 * at most "maxevents" of them to "events". Returns the number of events
 * delivered, 0 on timeout, or -EINTR if a signal interrupted the wait.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail, timed_out = 0;
	unsigned long flags;
	long slack;
	wait_queue_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	} else if (timeout == 0) {
		/* Non-blocking: just check the ready list once */
		timed_out = 1;
	}

retry:
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken by
		 * ep_poll_callback() when events become available.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto retry;

	return res;
}
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
/**
 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
 *                      API, to verify that adding an epoll file inside another
 *                      epoll structure does not violate the constraints, in
 *                      terms of closed loops, or too deep chains.
 *
 * @priv: Pointer to the epoll file to be currently checked.
 * @cookie: Original cookie for this call (the top-of-chain epoll file).
 * @call_nests: Current nesting depth (unused here; the nesting bound is
 *              enforced by ep_call_nested() itself).
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct eventpoll *ep = file->private_data;
	struct rb_node *rbp;
	struct epitem *epi;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		/* Recurse only into watched files that are themselves epolls */
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
					       ep_loop_check_proc, epi->ffd.file,
					       epi->ffd.file->private_data, current);
			if (error != 0)
				break;
		}
	}
	mutex_unlock(&ep->mtx);

	return error;
}
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261static int ep_loop_check(struct eventpoll *ep, struct file *file)
1262{
1263 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1264 ep_loop_check_proc, file, ep, current);
1265}
1266
1267
1268
1269
1270SYSCALL_DEFINE1(epoll_create1, int, flags)
1271{
1272 int error;
1273 struct eventpoll *ep = NULL;
1274
1275
1276 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1277
1278 if (flags & ~EPOLL_CLOEXEC)
1279 return -EINVAL;
1280
1281
1282
1283 error = ep_alloc(&ep);
1284 if (error < 0)
1285 return error;
1286
1287
1288
1289
1290 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1291 O_RDWR | (flags & O_CLOEXEC));
1292 if (error < 0)
1293 ep_free(ep);
1294
1295 return error;
1296}
1297
1298SYSCALL_DEFINE1(epoll_create, int, size)
1299{
1300 if (size <= 0)
1301 return -EINVAL;
1302
1303 return sys_epoll_create1(0);
1304}
1305
1306
1307
1308
1309
1310
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better be handled here, than in more critical paths.
	 *
	 * We hold epmutex across the loop check and the insert in this case, in
	 * order to prevent two separate inserts from racing and each doing the
	 * insert "at the same time" such that ep_loop_check passes on both
	 * before either one does the insert, thereby creating a cycle.
	 */
	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
		error = -ELOOP;
		if (ep_loop_check(ep, tfile) != 0)
			goto error_tgt_fput;
	}


	mutex_lock(&ep->mtx);

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tfile, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* POLLERR and POLLHUP are always reported */
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (unlikely(did_lock_epmutex))
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}
1421
1422
1423
1424
1425
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
		error = -EFAULT;
		goto error_return;
	}

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fput(file);
error_return:

	return error;
}
1472
#ifdef HAVE_SET_RESTORE_SIGMASK

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2): same as epoll_wait(2), but the
 * caller's signal mask is atomically replaced by @sigmask for the duration
 * of the wait.
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	int error;
	sigset_t ksigmask, sigsaved;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	if (sigmask) {
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;
		/* SIGKILL and SIGSTOP may never be blocked */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	error = sys_epoll_wait(epfd, events, maxevents, timeout);

	/*
	 * If we changed the signal mask, we need to restore the original one.
	 * In case we've got a signal while waiting, we do not restore the
	 * signal mask yet, and we allow do_signal() to deliver the signal on
	 * the way back to userspace, before the signal mask is restored.
	 *
	 * Fix: the first memcpy() argument had been corrupted by an
	 * HTML-entity mangle ("&curren;t" -> "¤t"); it must be
	 * &current->saved_sigmask.
	 */
	if (sigmask) {
		if (error == -EINTR) {
			memcpy(&current->saved_sigmask, &sigsaved,
			       sizeof(sigsaved));
			set_restore_sigmask();
		} else
			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	}

	return error;
}

#endif
1520
/* One-time initialization of the eventpoll subsystem */
static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_nested_calls_init(&poll_safewake_ncalls);

	/* Initialize the structure used to perform file's f_op->poll() calls */
	ep_nested_calls_init(&poll_readywalk_ncalls);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
1556