/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct eventfd_ctx {
        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also adds to the "count" counter and
         * issues a wakeup.
         */
        __u64 count;
        unsigned int flags;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning an EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented.  This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
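
/*
 * Illustrative usage sketch: a driver that already holds an eventfd_ctx
 * reference can signal completion from a context that must not sleep, such
 * as an interrupt handler. "foo_device" and "foo_irq_handler" below are
 * hypothetical names used only for this example.
 *
 *      static irqreturn_t foo_irq_handler(int irq, void *data)
 *      {
 *              struct foo_device *foo = data;
 *
 *              // Add 1 to the counter and wake any poll(2)/read(2) waiters.
 *              eventfd_signal(foo->trigger_ctx, 1);
 *              return IRQ_HANDLED;
 *      }
 */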

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
        struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

        eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, EPOLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        __poll_t events = 0;
        u64 count;

        poll_wait(file, &ctx->wqh, wait);

        /*
         * All writes to ctx->count occur within ctx->wqh.lock.  This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it!  add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read be ordered properly
         * against the writes.  The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= EPOLLIN;
        if (count == ULLONG_MAX)
                events |= EPOLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= EPOLLOUT;

        return events;
}

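/*
 * Consume the counter while holding ctx->wqh.lock: with EFD_SEMAPHORE set a
 * read consumes 1 from the counter and returns 1, otherwise it returns the
 * whole counter value and resets the counter to zero.
 */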
static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
        ctx->count -= *cnt;
}

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue entry to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
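
/*
 * Illustrative teardown sketch: a kernel consumer that earlier attached its
 * own wait_queue_entry_t to the eventfd wait queue detaches it like this,
 * draining any pending count in the same atomic step. The "foo" structure
 * and foo_shutdown() are hypothetical names for this example.
 *
 *      static void foo_shutdown(struct foo *foo)
 *      {
 *              __u64 cnt;
 *
 *              // Detach foo->wait from the eventfd and consume the counter;
 *              // -EAGAIN only means the counter was already zero.
 *              eventfd_ctx_remove_wait_queue(foo->ctx, &foo->wait, &cnt);
 *              eventfd_ctx_put(foo->ctx);
 *      }
 */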

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;

        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ctx->count > 0)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                eventfd_ctx_do_read(ctx, &ucnt);
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
                return -EFAULT;

        return res;
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;

        spin_lock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-count: %16llx\n",
                   (unsigned long long)ctx->count);
        spin_unlock_irq(&ctx->wqh.lock);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll           = eventfd_poll,
        .read           = eventfd_read,
        .write          = eventfd_write,
        .llseek         = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference to an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or one
 * of the following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
        if (file->f_op != &eventfd_fops) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }

        return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise one of the
 * following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        struct eventfd_ctx *ctx;
        struct fd f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);
        ctx = eventfd_ctx_fileget(f.file);
        fdput(f);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
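
/*
 * Illustrative usage sketch: a driver that receives an eventfd file
 * descriptor from userspace, e.g. via an ioctl, typically pins the context
 * once and signals it later from kernel paths. "foo_dev" and
 * foo_set_eventfd() are hypothetical names used only for this example.
 *
 *      static int foo_set_eventfd(struct foo_dev *foo, int fd)
 *      {
 *              struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);
 *
 *              if (IS_ERR(ctx))
 *                      return PTR_ERR(ctx);
 *
 *              foo->trigger_ctx = ctx;
 *              return 0;
 *      }
 *
 * The matching teardown path drops the reference with
 * eventfd_ctx_put(foo->trigger_ctx).
 */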

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        struct eventfd_ctx *ctx;

        if (file->f_op != &eventfd_fops)
                return ERR_PTR(-EINVAL);

        ctx = file->private_data;
        kref_get(&ctx->kref);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
        struct eventfd_ctx *ctx;
        int fd;

        /* Check the EFD_* constants for consistency.  */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;

        fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
                              O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
        if (fd < 0)
                eventfd_free_ctx(ctx);

        return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}
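
/*
 * Illustrative userspace sketch of the syscalls above, using the glibc
 * eventfd() wrapper rather than invoking the syscall directly:
 *
 *      #include <sys/eventfd.h>
 *      #include <stdint.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              uint64_t val;
 *              int efd = eventfd(0, EFD_CLOEXEC);      // counter starts at 0
 *
 *              if (efd < 0)
 *                      return 1;
 *              val = 3;
 *              write(efd, &val, sizeof(val));          // count += 3, wakes readers
 *              read(efd, &val, sizeof(val));           // returns 3, resets count to 0
 *              close(efd);
 *              return 0;
 *      }
 */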