/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct eventfd_ctx {
        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also adds to the "count" counter and
         * issues a wakeup.
         */
        __u64 count;
        unsigned int flags;
};
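
/*
 * Illustrative sketch (not part of this file's build) of the counter
 * semantics described above, as seen from userspace; the values are
 * hypothetical:
 *
 *      int efd = eventfd(0, 0);
 *      uint64_t v = 3;
 *      write(efd, &v, sizeof(v));      counter becomes 3, POLLIN waiters woken
 *      v = 4;
 *      write(efd, &v, sizeof(v));      counter becomes 7
 *      read(efd, &v, sizeof(v));       v == 7, counter reset to 0
 */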

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning a POLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented.  This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, POLLIN);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
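
/*
 * A minimal, hypothetical sketch of how an in-kernel producer might use
 * eventfd_signal() from a non-sleeping context such as an interrupt
 * handler; "my_dev" and its ->trigger_ctx field are made-up names:
 *
 *      static irqreturn_t my_dev_irq(int irq, void *data)
 *      {
 *              struct my_dev *dev = data;
 *
 *              if (dev->trigger_ctx)
 *                      eventfd_signal(dev->trigger_ctx, 1);
 *              return IRQ_HANDLED;
 *      }
 *
 * Userspace that holds the corresponding eventfd then sees the counter
 * become non-zero and is woken via POLLIN.
 */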

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
        struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

        eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
 * @ctx: [in] Pointer to the eventfd context.
 *
 * Returns: a pointer to the eventfd context.
 */
struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
{
        kref_get(&ctx->kref);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_get);

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_get() or eventfd_ctx_fdget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, POLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

static unsigned int eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        unsigned int events = 0;
        u64 count;

        poll_wait(file, &ctx->wqh, wait);

        /*
         * All writes to ctx->count occur within ctx->wqh.lock.  This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it!  add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read be ordered properly
         * against the writes.  The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= POLLIN;
        if (count == ULLONG_MAX)
                events |= POLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= POLLOUT;

        return events;
}
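
/*
 * Event bits as seen by a poll(2)/epoll user of the function above, for
 * illustration only; the "count" values are hypothetical:
 *
 *      count == 0              -> POLLOUT only
 *      count == 7              -> POLLIN | POLLOUT (readable, still writable)
 *      count == ULLONG_MAX - 1 -> POLLIN only (any non-zero write would block)
 *      count == ULLONG_MAX     -> POLLIN | POLLERR (overflow signalled)
 */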

static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
        ctx->count -= *cnt;
}
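
/*
 * Illustration of the two read modes implemented above; counter values
 * are hypothetical:
 *
 *      counter mode (default): count == 5, read() returns 5, count becomes 0
 *      EFD_SEMAPHORE:          count == 5, read() returns 1, count becomes 4
 */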

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
                                  __u64 *cnt)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, POLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
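
/*
 * Hypothetical sketch of a consumer tearing down a wait queue entry that
 * it had previously hooked onto the eventfd via a poll callback (the KVM
 * irqfd code follows this pattern); "my_obj" and its fields are made-up
 * names:
 *
 *      static void my_obj_shutdown(struct my_obj *obj)
 *      {
 *              __u64 cnt;
 *
 *              eventfd_ctx_remove_wait_queue(obj->eventfd, &obj->wait, &cnt);
 *              eventfd_ctx_put(obj->eventfd);
 *      }
 *
 * The entry is unhooked and the pending counter value drained in one
 * atomic step, so no wakeup can slip through in between.
 */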

/**
 * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
 * @ctx: [in] Pointer to eventfd context.
 * @no_wait: [in] Non-zero if the operation should not block.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
 * -ERESTARTSYS : A signal interrupted the wait operation.
 *
 * If @no_wait is zero, the function might sleep until the eventfd internal
 * counter becomes greater than zero.
 */
ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
{
        ssize_t res;
        DECLARE_WAITQUEUE(wait, current);

        spin_lock_irq(&ctx->wqh.lock);
        *cnt = 0;
        res = -EAGAIN;
        if (ctx->count > 0)
                res = 0;
        else if (!no_wait) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = 0;
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res == 0)) {
                eventfd_ctx_do_read(ctx, cnt);
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_read);

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 cnt;

        if (count < sizeof(cnt))
                return -EINVAL;
        res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
        if (res < 0)
                return res;

        return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}
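
/*
 * End-to-end illustration of the read and write paths above from a
 * userspace point of view; a hypothetical sequence, not part of the
 * kernel build:
 *
 *      int efd = eventfd(0, EFD_NONBLOCK);
 *      uint64_t v;
 *
 *      read(efd, &v, sizeof(v));       counter is 0 -> fails with EAGAIN
 *      v = 2;
 *      write(efd, &v, sizeof(v));      counter becomes 2, POLLIN waiters woken
 *      read(efd, &v, sizeof(v));       v == 2, counter reset to 0
 *      v = (uint64_t) -1;
 *      write(efd, &v, sizeof(v));      rejected with EINVAL (see eventfd_write())
 *
 * Without EFD_NONBLOCK the first read() would instead sleep in
 * eventfd_ctx_read() until the counter became non-zero.
 */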

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;

        spin_lock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-count: %16llx\n",
                   (unsigned long long)ctx->count);
        spin_unlock_irq(&ctx->wqh.lock);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll           = eventfd_poll,
        .read           = eventfd_read,
        .write          = eventfd_write,
        .llseek         = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference to an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
        if (file->f_op != &eventfd_fops) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }

        return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise an error
 * pointer:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        struct eventfd_ctx *ctx;
        struct fd f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);
        ctx = eventfd_ctx_fileget(f.file);
        fdput(f);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
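
/*
 * Hypothetical sketch of the typical in-kernel lifecycle built on the
 * helpers above: a subsystem receives an eventfd file descriptor from
 * userspace (for example via an ioctl), pins the context, signals it
 * later with eventfd_signal(), and finally drops its reference.
 * "my_obj", its ->trigger field and "efd" are made-up names:
 *
 *      int my_obj_set_trigger(struct my_obj *obj, int efd)
 *      {
 *              struct eventfd_ctx *ctx;
 *
 *              ctx = eventfd_ctx_fdget(efd);
 *              if (IS_ERR(ctx))
 *                      return PTR_ERR(ctx);
 *              obj->trigger = ctx;
 *              return 0;
 *      }
 *
 *      void my_obj_clear_trigger(struct my_obj *obj)
 *      {
 *              eventfd_ctx_put(obj->trigger);
 *              obj->trigger = NULL;
 *      }
 */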

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        if (file->f_op != &eventfd_fops)
                return ERR_PTR(-EINVAL);

        return eventfd_ctx_get(file->private_data);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

/**
 * eventfd_file_create - Creates an eventfd file pointer.
 * @count: Initial eventfd counter value.
 * @flags: Flags for the eventfd file.
 *
 * This function creates an eventfd file pointer, without installing it into
 * the fd table. This is useful when the eventfd file is used during the
 * initialization of data structures that require extra setup after the eventfd
 * creation. So the eventfd creation is split into the file pointer creation
 * phase, and the file descriptor installation phase.
 * In this way races with userspace closing the newly installed file descriptor
 * can be avoided.
 * Returns an eventfd file pointer, or a proper error pointer.
 */
struct file *eventfd_file_create(unsigned int count, int flags)
{
        struct file *file;
        struct eventfd_ctx *ctx;

        /* Check the EFD_* constants for consistency.  */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return ERR_PTR(-EINVAL);

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return ERR_PTR(-ENOMEM);

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;

        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
                                  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
        if (IS_ERR(file))
                eventfd_free_ctx(ctx);

        return file;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        int fd, error;
        struct file *file;

        error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
        if (error < 0)
                return error;
        fd = error;

        file = eventfd_file_create(count, flags);
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto err_put_unused_fd;
        }
        fd_install(fd, file);

        return fd;

err_put_unused_fd:
        put_unused_fd(fd);

        return error;
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return sys_eventfd2(count, 0);
}
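
/*
 * Userspace view of the two system calls above; a hypothetical example,
 * not part of the kernel build.  glibc's eventfd(3) wrapper normally maps
 * to eventfd2:
 *
 *      #include <sys/eventfd.h>
 *
 *      int efd = eventfd(10, EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE);
 *      if (efd < 0)
 *              perror("eventfd");
 *
 * The counter starts at 10; with EFD_SEMAPHORE each read() returns 1 and
 * decrements the counter by one.
 */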