linux/fs/select.c
<<
>>
Prefs
   1/*
   2 * This file contains the procedures for the handling of select and poll
   3 *
   4 * Created for Linux based loosely upon Mathius Lattner's minix
   5 * patches by Peter MacDonald. Heavily edited by Linus.
   6 *
   7 *  4 February 1994
   8 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
   9 *     flag set in its personality we do *not* modify the given timeout
  10 *     parameter to reflect time remaining.
  11 *
  12 *  24 January 2000
  13 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
  14 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
  15 */
  16
  17#include <linux/kernel.h>
  18#include <linux/syscalls.h>
  19#include <linux/module.h>
  20#include <linux/slab.h>
  21#include <linux/poll.h>
  22#include <linux/personality.h> /* for STICKY_TIMEOUTS */
  23#include <linux/file.h>
  24#include <linux/fs.h>
  25#include <linux/rcupdate.h>
  26
  27#include <asm/uaccess.h>
  28
  29struct poll_table_page {
  30        struct poll_table_page * next;
  31        struct poll_table_entry * entry;
  32        struct poll_table_entry entries[0];
  33};
  34
  /*
   * True when one more entry after (table)->entry would end beyond
   * PAGE_SIZE bytes from the start of (table), i.e. the chunk has no
   * room for another entry.
   */
  35#define POLL_TABLE_FULL(table) \
  36        ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
  37
  38/*
  39 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
  40 * I have rewritten this, taking some shortcuts: This code may not be easy to
  41 * follow, but it should be free of race-conditions, and it's practical. If you
  42 * understand what I'm doing here, then you understand how the linux
  43 * sleep/wakeup mechanism works.
  44 *
  45 * Two very simple procedures, poll_wait() and poll_freewait() make all the
  46 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
  47 * as all select/poll functions have to call it to add an entry to the
  48 * poll table.
  49 */
  50static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
  51                       poll_table *p);
  52
  53void poll_initwait(struct poll_wqueues *pwq)
  54{
  55        init_poll_funcptr(&pwq->pt, __pollwait);
  56        pwq->error = 0;
  57        pwq->table = NULL;
  58        pwq->inline_index = 0;
  59}
  60
  61EXPORT_SYMBOL(poll_initwait);
  62
  63static void free_poll_entry(struct poll_table_entry *entry)
  64{
  65        remove_wait_queue(entry->wait_address, &entry->wait);
  66        fput(entry->filp);
  67}
  68
  69void poll_freewait(struct poll_wqueues *pwq)
  70{
  71        struct poll_table_page * p = pwq->table;
  72        int i;
  73        for (i = 0; i < pwq->inline_index; i++)
  74                free_poll_entry(pwq->inline_entries + i);
  75        while (p) {
  76                struct poll_table_entry * entry;
  77                struct poll_table_page *old;
  78
  79                entry = p->entry;
  80                do {
  81                        entry--;
  82                        free_poll_entry(entry);
  83                } while (entry > p->entries);
  84                old = p;
  85                p = p->next;
  86                free_page((unsigned long) old);
  87        }
  88}
  89
  90EXPORT_SYMBOL(poll_freewait);
  91
  92static struct poll_table_entry *poll_get_entry(poll_table *_p)
  93{
  94        struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
  95        struct poll_table_page *table = p->table;
  96
  97        if (p->inline_index < N_INLINE_POLL_ENTRIES)
  98                return p->inline_entries + p->inline_index++;
  99
 100        if (!table || POLL_TABLE_FULL(table)) {
 101                struct poll_table_page *new_table;
 102
 103                new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
 104                if (!new_table) {
 105                        p->error = -ENOMEM;
 106                        __set_current_state(TASK_RUNNING);
 107                        return NULL;
 108                }
 109                new_table->entry = new_table->entries;
 110                new_table->next = table;
 111                p->table = new_table;
 112                table = new_table;
 113        }
 114
 115        return table->entry++;
 116}
 117
 118/* Add a new entry */
 119static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 120                                poll_table *p)
 121{
 122        struct poll_table_entry *entry = poll_get_entry(p);
 123        if (!entry)
 124                return;
 125        get_file(filp);
 126        entry->filp = filp;
 127        entry->wait_address = wait_address;
 128        init_waitqueue_entry(&entry->wait, current);
 129        add_wait_queue(wait_address, &entry->wait);
 130}
 131
 132#define FDS_IN(fds, n)          (fds->in + n)
 133#define FDS_OUT(fds, n)         (fds->out + n)
 134#define FDS_EX(fds, n)          (fds->ex + n)
 135
 136#define BITS(fds, n)    (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
 137
  /*
   * Validate the requested descriptor sets and find the highest requested
   * descriptor.
   *
   * @n:   number of descriptors covered by the bitmaps in @fds
   * @fds: request bitmaps; only ->in, ->out and ->ex are read here
   *
   * The bitmaps are walked from the highest word down.  Returns -EBADF if
   * any requested bit is clear in the current task's open_fds bitmap,
   * otherwise one more than the highest requested descriptor (0 when
   * nothing is requested).  do_select() calls this under rcu_read_lock().
   */
  138static int max_select_fd(unsigned long n, fd_set_bits *fds)
  139{
  140        unsigned long *open_fds;
  141        unsigned long set;
  142        int max;
  143        struct fdtable *fdt;
  144
  145        /* handle last in-complete long-word first */
  /* Mask of the bits below n in the final word; 0 if n is a multiple of __NFDBITS. */
  146        set = ~(~0UL << (n & (__NFDBITS-1)));
  /* From here on n is a word index rather than a descriptor count. */
  147        n /= __NFDBITS;
  148        fdt = files_fdtable(current->files);
  149        open_fds = fdt->open_fds->fds_bits+n;
  150        max = 0;
  151        if (set) {
  152                set &= BITS(fds, n);
  153                if (set) {
  /* All requested bits are open: jump into the loop body to compute max. */
  154                        if (!(set & ~*open_fds))
  155                                goto get_max;
  156                        return -EBADF;
  157                }
  158        }
  /* Remaining full words, highest first. */
  159        while (n) {
  160                open_fds--;
  161                n--;
  162                set = BITS(fds, n);
  163                if (!set)
  164                        continue;
  165                if (set & ~*open_fds)
  166                        return -EBADF;
  /* max already known: lower words are only checked for -EBADF. */
  167                if (max)
  168                        continue;
  169get_max:
  /* Position of the highest set bit in this word (1-based), plus the word's bit offset. */
  170                do {
  171                        max++;
  172                        set >>= 1;
  173                } while (set);
  174                max += n * __NFDBITS;
  175        }
  176
  177        return max;
  178}
 179
  /*
   * Poll event masks that do_select() tests to decide whether a descriptor
   * is reported in the read (in), write (out) or exception (ex) result set.
   */
  180#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
  181#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
  182#define POLLEX_SET (POLLPRI)
 183
  /*
   * Core of select(): poll every requested descriptor, sleeping between
   * passes until something is ready, the timeout runs out, a signal is
   * pending or the poll table records an error.
   *
   * @n:       number of descriptors to consider; replaced by the value
   *           returned from max_select_fd()
   * @fds:     request bitmaps (in/out/ex) and result bitmaps (res_*)
   * @timeout: wait budget passed to schedule_timeout(); 0 means do not
   *           wait, negative means wait indefinitely.  Updated in place
   *           with whatever is left.
   *
   * Returns the number of (descriptor, set) hits recorded in the result
   * bitmaps, 0 if there were none, or a negative errno taken from
   * max_select_fd() or table.error.
   */
  184int do_select(int n, fd_set_bits *fds, s64 *timeout)
  185{
  186        struct poll_wqueues table;
  187        poll_table *wait;
  188        int retval, i;
  189
  190        rcu_read_lock();
  191        retval = max_select_fd(n, fds);
  192        rcu_read_unlock();
  193
  194        if (retval < 0)
  195                return retval;
  196        n = retval;
  197
  198        poll_initwait(&table);
  199        wait = &table.pt;
  /* Zero timeout: never sleep, so do not register on any wait queue. */
  200        if (!*timeout)
  201                wait = NULL;
  202        retval = 0;
  203        for (;;) {
  204                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
  205                long __timeout;
  206
  /* Mark ourselves sleeping before polling so a wakeup during the scan is not lost. */
  207                set_current_state(TASK_INTERRUPTIBLE);
  208
  209                inp = fds->in; outp = fds->out; exp = fds->ex;
  210                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
  211
  /* One iteration per bitmap word; i is advanced inside the body. */
  212                for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
  213                        unsigned long in, out, ex, all_bits, bit = 1, mask, j;
  214                        unsigned long res_in = 0, res_out = 0, res_ex = 0;
  215                        const struct file_operations *f_op = NULL;
  216                        struct file *file = NULL;
  217
  218                        in = *inp++; out = *outp++; ex = *exp++;
  219                        all_bits = in | out | ex;
  /* Nothing requested in this word: skip all of its descriptors at once. */
  220                        if (all_bits == 0) {
  221                                i += __NFDBITS;
  222                                continue;
  223                        }
  224
  225                        for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
  226                                int fput_needed;
  227                                if (i >= n)
  228                                        break;
  229                                if (!(bit & all_bits))
  230                                        continue;
  231                                file = fget_light(i, &fput_needed);
  232                                if (file) {
  233                                        f_op = file->f_op;
  234                                        mask = DEFAULT_POLLMASK;
  /* Once a hit has been counted, stop passing the poll table to ->poll(). */
  235                                        if (f_op && f_op->poll)
  236                                                mask = (*f_op->poll)(file, retval ? NULL : wait);
  237                                        fput_light(file, fput_needed);
  238                                        if ((mask & POLLIN_SET) && (in & bit)) {
  239                                                res_in |= bit;
  240                                                retval++;
  241                                        }
  242                                        if ((mask & POLLOUT_SET) && (out & bit)) {
  243                                                res_out |= bit;
  244                                                retval++;
  245                                        }
  246                                        if ((mask & POLLEX_SET) && (ex & bit)) {
  247                                                res_ex |= bit;
  248                                                retval++;
  249                                        }
  250                                }
  251                                cond_resched();
  252                        }
  /* Result words are only written when non-zero. */
  253                        if (res_in)
  254                                *rinp = res_in;
  255                        if (res_out)
  256                                *routp = res_out;
  257                        if (res_ex)
  258                                *rexp = res_ex;
  259                }
  /* The first pass is over: later passes never pass the poll table again. */
  260                wait = NULL;
  261                if (retval || !*timeout || signal_pending(current))
  262                        break;
  263                if(table.error) {
  264                        retval = table.error;
  265                        break;
  266                }
  267
  268                if (*timeout < 0) {
  269                        /* Wait indefinitely */
  270                        __timeout = MAX_SCHEDULE_TIMEOUT;
  271                } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
  272                        /* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
  273                        __timeout = MAX_SCHEDULE_TIMEOUT - 1;
  274                        *timeout -= __timeout;
  275                } else {
  276                        __timeout = *timeout;
  277                        *timeout = 0;
  278                }
  /* Add the value schedule_timeout() returns back onto a non-negative *timeout. */
  279                __timeout = schedule_timeout(__timeout);
  280                if (*timeout >= 0)
  281                        *timeout += __timeout;
  282        }
  283        __set_current_state(TASK_RUNNING);
  284
  285        poll_freewait(&table);
  286
  287        return retval;
  288}
 289
 290/*
 291 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 292 * like to be certain this leads to no problems. So I return
 293 * EINTR just for safety.
 294 *
 295 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 296 * I'm trying ERESTARTNOHAND which restart only when you want to.
 297 */
  /*
   * MAX_SCHEDULE_TIMEOUT expressed in whole seconds, minus one.
   * NOTE(review): nothing else in the visible part of this file refers to
   * this macro; confirm it is still needed.
   */
  298#define MAX_SELECT_SECONDS \
  299        ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 300
 301static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 302                           fd_set __user *exp, s64 *timeout)
 303{
 304        fd_set_bits fds;
 305        void *bits;
 306        int ret, max_fds;
 307        unsigned int size;
 308        struct fdtable *fdt;
 309        /* Allocate small arguments on the stack to save memory and be faster */
 310        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
 311
 312        ret = -EINVAL;
 313        if (n < 0)
 314                goto out_nofds;
 315
 316        /* max_fds can increase, so grab it once to avoid race */
 317        rcu_read_lock();
 318        fdt = files_fdtable(current->files);
 319        max_fds = fdt->max_fds;
 320        rcu_read_unlock();
 321        if (n > max_fds)
 322                n = max_fds;
 323
 324        /*
 325         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
 326         * since we used fdset we need to allocate memory in units of
 327         * long-words. 
 328         */
 329        size = FDS_BYTES(n);
 330        bits = stack_fds;
 331        if (size > sizeof(stack_fds) / 6) {
 332                /* Not enough space in on-stack array; must use kmalloc */
 333                ret = -ENOMEM;
 334                bits = kmalloc(6 * size, GFP_KERNEL);
 335                if (!bits)
 336                        goto out_nofds;
 337        }
 338        fds.in      = bits;
 339        fds.out     = bits +   size;
 340        fds.ex      = bits + 2*size;
 341        fds.res_in  = bits + 3*size;
 342        fds.res_out = bits + 4*size;
 343        fds.res_ex  = bits + 5*size;
 344
 345        if ((ret = get_fd_set(n, inp, fds.in)) ||
 346            (ret = get_fd_set(n, outp, fds.out)) ||
 347            (ret = get_fd_set(n, exp, fds.ex)))
 348                goto out;
 349        zero_fd_set(n, fds.res_in);
 350        zero_fd_set(n, fds.res_out);
 351        zero_fd_set(n, fds.res_ex);
 352
 353        ret = do_select(n, &fds, timeout);
 354
 355        if (ret < 0)
 356                goto out;
 357        if (!ret) {
 358                ret = -ERESTARTNOHAND;
 359                if (signal_pending(current))
 360                        goto out;
 361                ret = 0;
 362        }
 363
 364        if (set_fd_set(n, inp, fds.res_in) ||
 365            set_fd_set(n, outp, fds.res_out) ||
 366            set_fd_set(n, exp, fds.res_ex))
 367                ret = -EFAULT;
 368
 369out:
 370        if (bits != stack_fds)
 371                kfree(bits);
 372out_nofds:
 373        return ret;
 374}
 375
 376asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 377                        fd_set __user *exp, struct timeval __user *tvp)
 378{
 379        s64 timeout = -1;
 380        struct timeval tv;
 381        int ret;
 382
 383        if (tvp) {
 384                if (copy_from_user(&tv, tvp, sizeof(tv)))
 385                        return -EFAULT;
 386
 387                if (tv.tv_sec < 0 || tv.tv_usec < 0)
 388                        return -EINVAL;
 389
 390                /* Cast to u64 to make GCC stop complaining */
 391                if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
 392                        timeout = -1;   /* infinite */
 393                else {
 394                        timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
 395                        timeout += tv.tv_sec * HZ;
 396                }
 397        }
 398
 399        ret = core_sys_select(n, inp, outp, exp, &timeout);
 400
 401        if (tvp) {
 402                struct timeval rtv;
 403
 404                if (current->personality & STICKY_TIMEOUTS)
 405                        goto sticky;
 406                rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
 407                rtv.tv_sec = timeout;
 408                if (timeval_compare(&rtv, &tv) >= 0)
 409                        rtv = tv;
 410                if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 411sticky:
 412                        /*
 413                         * If an application puts its timeval in read-only
 414                         * memory, we don't want the Linux-specific update to
 415                         * the timeval to cause a fault after the select has
 416                         * completed successfully. However, because we're not
 417                         * updating the timeval, we can't restart the system
 418                         * call.
 419                         */
 420                        if (ret == -ERESTARTNOHAND)
 421                                ret = -EINTR;
 422                }
 423        }
 424
 425        return ret;
 426}
 427
 428#ifdef TIF_RESTORE_SIGMASK
 429asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 430                fd_set __user *exp, struct timespec __user *tsp,
 431                const sigset_t __user *sigmask, size_t sigsetsize)
 432{
 433        s64 timeout = MAX_SCHEDULE_TIMEOUT;
 434        sigset_t ksigmask, sigsaved;
 435        struct timespec ts;
 436        int ret;
 437
 438        if (tsp) {
 439                if (copy_from_user(&ts, tsp, sizeof(ts)))
 440                        return -EFAULT;
 441
 442                if (ts.tv_sec < 0 || ts.tv_nsec < 0)
 443                        return -EINVAL;
 444
 445                /* Cast to u64 to make GCC stop complaining */
 446                if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
 447                        timeout = -1;   /* infinite */
 448                else {
 449                        timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
 450                        timeout += ts.tv_sec * HZ;
 451                }
 452        }
 453
 454        if (sigmask) {
 455                /* XXX: Don't preclude handling different sized sigset_t's.  */
 456                if (sigsetsize != sizeof(sigset_t))
 457                        return -EINVAL;
 458                if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 459                        return -EFAULT;
 460
 461                sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 462                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 463        }
 464
 465        ret = core_sys_select(n, inp, outp, exp, &timeout);
 466
 467        if (tsp) {
 468                struct timespec rts;
 469
 470                if (current->personality & STICKY_TIMEOUTS)
 471                        goto sticky;
 472                rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 473                                                1000;
 474                rts.tv_sec = timeout;
 475                if (timespec_compare(&rts, &ts) >= 0)
 476                        rts = ts;
 477                if (copy_to_user(tsp, &rts, sizeof(rts))) {
 478sticky:
 479                        /*
 480                         * If an application puts its timeval in read-only
 481                         * memory, we don't want the Linux-specific update to
 482                         * the timeval to cause a fault after the select has
 483                         * completed successfully. However, because we're not
 484                         * updating the timeval, we can't restart the system
 485                         * call.
 486                         */
 487                        if (ret == -ERESTARTNOHAND)
 488                                ret = -EINTR;
 489                }
 490        }
 491
 492        if (ret == -ERESTARTNOHAND) {
 493                /*
 494                 * Don't restore the signal mask yet. Let do_signal() deliver
 495                 * the signal on the way back to userspace, before the signal
 496                 * mask is restored.
 497                 */
 498                if (sigmask) {
 499                        memcpy(&current->saved_sigmask, &sigsaved,
 500                                        sizeof(sigsaved));
 501                        set_thread_flag(TIF_RESTORE_SIGMASK);
 502                }
 503        } else if (sigmask)
 504                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 505
 506        return ret;
 507}
 508
 509/*
 510 * Most architectures can't handle 7-argument syscalls. So we provide a
 511 * 6-argument version where the sixth argument is a pointer to a structure
 512 * which has a pointer to the sigset_t itself followed by a size_t containing
 513 * the sigset size.
 514 */
 515asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
 516        fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
 517{
 518        size_t sigsetsize = 0;
 519        sigset_t __user *up = NULL;
 520
 521        if (sig) {
 522                if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
 523                    || __get_user(up, (sigset_t __user * __user *)sig)
 524                    || __get_user(sigsetsize,
 525                                (size_t __user *)(sig+sizeof(void *))))
 526                        return -EFAULT;
 527        }
 528
 529        return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
 530}
 531#endif /* TIF_RESTORE_SIGMASK */
 532
 533struct poll_list {
 534        struct poll_list *next;
 535        int len;
 536        struct pollfd entries[0];
 537};
 538
  /* Number of struct pollfd entries that fit in PAGE_SIZE bytes after a struct poll_list header. */
  539#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 540
 541/*
 542 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 543 * interested in events matching the pollfd->events mask, and the result
 544 * matching that mask is both recorded in pollfd->revents and returned. The
 545 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 546 * if non-NULL.
 547 */
 548static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 549{
 550        unsigned int mask;
 551        int fd;
 552
 553        mask = 0;
 554        fd = pollfd->fd;
 555        if (fd >= 0) {
 556                int fput_needed;
 557                struct file * file;
 558
 559                file = fget_light(fd, &fput_needed);
 560                mask = POLLNVAL;
 561                if (file != NULL) {
 562                        mask = DEFAULT_POLLMASK;
 563                        if (file->f_op && file->f_op->poll)
 564                                mask = file->f_op->poll(file, pwait);
 565                        /* Mask out unneeded events. */
 566                        mask &= pollfd->events | POLLERR | POLLHUP;
 567                        fput_light(file, fput_needed);
 568                }
 569        }
 570        pollfd->revents = mask;
 571
 572        return mask;
 573}
 574
  /*
   * Core of poll(): scan every pollfd in @list, sleeping between passes
   * until at least one reports events, the timeout runs out, a signal is
   * pending or the poll table records an error.
   *
   * @nfds:    not used in this function
   * @list:    chain of pollfd chunks to scan
   * @wait:    poll table used to register on wait queues during the first pass
   * @timeout: wait budget passed to schedule_timeout(); 0 means do not
   *           wait, negative means wait indefinitely.  Updated in place.
   *
   * Returns the number of pollfds with a non-zero result, 0 on timeout,
   * wait->error, or -EINTR when nothing was ready and a signal is pending.
   */
  575static int do_poll(unsigned int nfds,  struct poll_list *list,
  576                   struct poll_wqueues *wait, s64 *timeout)
  577{
  578        int count = 0;
  579        poll_table* pt = &wait->pt;
  580
  581        /* Optimise the no-wait case */
  582        if (!(*timeout))
  583                pt = NULL;
  584
  585        for (;;) {
  586                struct poll_list *walk;
  587                long __timeout;
  588
  /* Mark ourselves sleeping before polling so a wakeup during the scan is not lost. */
  589                set_current_state(TASK_INTERRUPTIBLE);
  590                for (walk = list; walk != NULL; walk = walk->next) {
  591                        struct pollfd * pfd, * pfd_end;
  592
  593                        pfd = walk->entries;
  594                        pfd_end = pfd + walk->len;
  595                        for (; pfd != pfd_end; pfd++) {
  596                                /*
  597                                 * Fish for events. If we found one, record it
  598                                 * and kill the poll_table, so we don't
  599                                 * needlessly register any other waiters after
  600                                 * this. They'll get immediately deregistered
  601                                 * when we break out and return.
  602                                 */
  603                                if (do_pollfd(pfd, pt)) {
  604                                        count++;
  605                                        pt = NULL;
  606                                }
  607                        }
  608                }
  609                /*
  610                 * All waiters have already been registered, so don't provide
  611                 * a poll_table to them on the next loop iteration.
  612                 */
  613                pt = NULL;
  /* Nothing ready: report a poll-table error, or -EINTR if a signal is pending. */
  614                if (!count) {
  615                        count = wait->error;
  616                        if (signal_pending(current))
  617                                count = -EINTR;
  618                }
  619                if (count || !*timeout)
  620                        break;
  621
  622                if (*timeout < 0) {
  623                        /* Wait indefinitely */
  624                        __timeout = MAX_SCHEDULE_TIMEOUT;
  625                } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
  626                        /*
  627                         * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
  628                         * a loop
  629                         */
  630                        __timeout = MAX_SCHEDULE_TIMEOUT - 1;
  631                        *timeout -= __timeout;
  632                } else {
  633                        __timeout = *timeout;
  634                        *timeout = 0;
  635                }
  636
  /* Add the value schedule_timeout() returns back onto a non-negative *timeout. */
  637                __timeout = schedule_timeout(__timeout);
  638                if (*timeout >= 0)
  639                        *timeout += __timeout;
  640        }
  641        __set_current_state(TASK_RUNNING);
  642        return count;
  643}
 644
  /*
   * Number of struct pollfd entries that fit in the stack_pps buffer after
   * a struct poll_list header.  Expands to a reference to a variable named
   * stack_pps, so it is only usable where one is in scope (do_sys_poll()).
   */
  645#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
  646                        sizeof(struct pollfd))
 647
 648int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 649{
 650        struct poll_wqueues table;
 651        int err = -EFAULT, fdcount, len, size;
 652        /* Allocate small arguments on the stack to save memory and be
 653           faster - use long to make sure the buffer is aligned properly
 654           on 64 bit archs to avoid unaligned access */
 655        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
 656        struct poll_list *const head = (struct poll_list *)stack_pps;
 657        struct poll_list *walk = head;
 658        unsigned long todo = nfds;
 659
 660        if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 661                return -EINVAL;
 662
 663        len = min_t(unsigned int, nfds, N_STACK_PPS);
 664        for (;;) {
 665                walk->next = NULL;
 666                walk->len = len;
 667                if (!len)
 668                        break;
 669
 670                if (copy_from_user(walk->entries, ufds + nfds-todo,
 671                                        sizeof(struct pollfd) * walk->len))
 672                        goto out_fds;
 673
 674                todo -= walk->len;
 675                if (!todo)
 676                        break;
 677
 678                len = min(todo, POLLFD_PER_PAGE);
 679                size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
 680                walk = walk->next = kmalloc(size, GFP_KERNEL);
 681                if (!walk) {
 682                        err = -ENOMEM;
 683                        goto out_fds;
 684                }
 685        }
 686
 687        poll_initwait(&table);
 688        fdcount = do_poll(nfds, head, &table, timeout);
 689        poll_freewait(&table);
 690
 691        for (walk = head; walk; walk = walk->next) {
 692                struct pollfd *fds = walk->entries;
 693                int j;
 694
 695                for (j = 0; j < walk->len; j++, ufds++)
 696                        if (__put_user(fds[j].revents, &ufds->revents))
 697                                goto out_fds;
 698        }
 699
 700        err = fdcount;
 701out_fds:
 702        walk = head->next;
 703        while (walk) {
 704                struct poll_list *pos = walk;
 705                walk = walk->next;
 706                kfree(pos);
 707        }
 708
 709        return err;
 710}
 711
 712static long do_restart_poll(struct restart_block *restart_block)
 713{
 714        struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0;
 715        int nfds = restart_block->arg1;
 716        s64 timeout = ((s64)restart_block->arg3<<32) | (s64)restart_block->arg2;
 717        int ret;
 718
 719        ret = do_sys_poll(ufds, nfds, &timeout);
 720        if (ret == -EINTR) {
 721                restart_block->fn = do_restart_poll;
 722                restart_block->arg2 = timeout & 0xFFFFFFFF;
 723                restart_block->arg3 = (u64)timeout >> 32;
 724                ret = -ERESTART_RESTARTBLOCK;
 725        }
 726        return ret;
 727}
 728
 729asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 730                        long timeout_msecs)
 731{
 732        s64 timeout_jiffies;
 733        int ret;
 734
 735        if (timeout_msecs > 0) {
 736#if HZ > 1000
 737                /* We can only overflow if HZ > 1000 */
 738                if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
 739                        timeout_jiffies = -1;
 740                else
 741#endif
 742                        timeout_jiffies = msecs_to_jiffies(timeout_msecs);
 743        } else {
 744                /* Infinite (< 0) or no (0) timeout */
 745                timeout_jiffies = timeout_msecs;
 746        }
 747
 748        ret = do_sys_poll(ufds, nfds, &timeout_jiffies);
 749        if (ret == -EINTR) {
 750                struct restart_block *restart_block;
 751                restart_block = &current_thread_info()->restart_block;
 752                restart_block->fn = do_restart_poll;
 753                restart_block->arg0 = (unsigned long)ufds;
 754                restart_block->arg1 = nfds;
 755                restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF;
 756                restart_block->arg3 = (u64)timeout_jiffies >> 32;
 757                ret = -ERESTART_RESTARTBLOCK;
 758        }
 759        return ret;
 760}
 761
 762#ifdef TIF_RESTORE_SIGMASK
 763asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 764        struct timespec __user *tsp, const sigset_t __user *sigmask,
 765        size_t sigsetsize)
 766{
 767        sigset_t ksigmask, sigsaved;
 768        struct timespec ts;
 769        s64 timeout = -1;
 770        int ret;
 771
 772        if (tsp) {
 773                if (copy_from_user(&ts, tsp, sizeof(ts)))
 774                        return -EFAULT;
 775
 776                /* Cast to u64 to make GCC stop complaining */
 777                if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
 778                        timeout = -1;   /* infinite */
 779                else {
 780                        timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
 781                        timeout += ts.tv_sec * HZ;
 782                }
 783        }
 784
 785        if (sigmask) {
 786                /* XXX: Don't preclude handling different sized sigset_t's.  */
 787                if (sigsetsize != sizeof(sigset_t))
 788                        return -EINVAL;
 789                if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 790                        return -EFAULT;
 791
 792                sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 793                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 794        }
 795
 796        ret = do_sys_poll(ufds, nfds, &timeout);
 797
 798        /* We can restart this syscall, usually */
 799        if (ret == -EINTR) {
 800                /*
 801                 * Don't restore the signal mask yet. Let do_signal() deliver
 802                 * the signal on the way back to userspace, before the signal
 803                 * mask is restored.
 804                 */
 805                if (sigmask) {
 806                        memcpy(&current->saved_sigmask, &sigsaved,
 807                                        sizeof(sigsaved));
 808                        set_thread_flag(TIF_RESTORE_SIGMASK);
 809                }
 810                ret = -ERESTARTNOHAND;
 811        } else if (sigmask)
 812                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 813
 814        if (tsp && timeout >= 0) {
 815                struct timespec rts;
 816
 817                if (current->personality & STICKY_TIMEOUTS)
 818                        goto sticky;
 819                /* Yes, we know it's actually an s64, but it's also positive. */
 820                rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 821                                                1000;
 822                rts.tv_sec = timeout;
 823                if (timespec_compare(&rts, &ts) >= 0)
 824                        rts = ts;
 825                if (copy_to_user(tsp, &rts, sizeof(rts))) {
 826                sticky:
 827                        /*
 828                         * If an application puts its timeval in read-only
 829                         * memory, we don't want the Linux-specific update to
 830                         * the timeval to cause a fault after the select has
 831                         * completed successfully. However, because we're not
 832                         * updating the timeval, we can't restart the system
 833                         * call.
 834                         */
 835                        if (ret == -ERESTARTNOHAND && timeout >= 0)
 836                                ret = -EINTR;
 837                }
 838        }
 839
 840        return ret;
 841}
 842#endif /* TIF_RESTORE_SIGMASK */
 843