linux/fs/eventfd.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also adds to the "count" counter and
         * issues a wakeup.
         */
        __u64 count;
        unsigned int flags;
        int id;
};
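
/*
 * A minimal userspace sketch of the counter semantics described above
 * (illustrative only, not part of this file): every read(2)/write(2) on an
 * eventfd transfers exactly one native-endian __u64.
 *
 *        uint64_t val = 1;
 *        int efd = eventfd(0, 0);
 *
 *        write(efd, &val, sizeof(val));  // adds 1 to the counter, wakes readers
 *        read(efd, &val, sizeof(val));   // returns the counter (1), resets it to 0
 */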

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented.  This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        unsigned long flags;

        /*
         * Deadlock or stack overflow issues can happen if we recurse here
         * through waitqueue wakeup handlers. If the caller uses potentially
         * nested waitqueues with custom wakeup handlers, then it should
         * check eventfd_signal_allowed() before calling this function. If
         * it returns false, the eventfd_signal() call should be deferred to a
         * safe context.
         */
        if (WARN_ON_ONCE(current->in_eventfd_signal))
                return 0;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        current->in_eventfd_signal = 1;
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        current->in_eventfd_signal = 0;
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
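
/*
 * Kernel-side usage sketch (illustrative, hypothetical caller; not part of
 * this file): callers that may run under nested waitqueues check
 * eventfd_signal_allowed() and fall back to a deferred context, e.g. a work
 * item, instead of signalling inline.
 *
 *        if (eventfd_signal_allowed())
 *                eventfd_signal(ctx, 1);
 *        else
 *                schedule_work(&my_dev->signal_work);  // the work handler calls eventfd_signal()
 */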

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        if (ctx->id >= 0)
                ida_simple_remove(&eventfd_ida, ctx->id);
        kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
        struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

        eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, EPOLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        __poll_t events = 0;
        u64 count;

        poll_wait(file, &ctx->wqh, wait);

        /*
         * All writes to ctx->count occur within ctx->wqh.lock.  This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it!  add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read is ordered properly
         * against the writes.  The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= EPOLLIN;
        if (count == ULLONG_MAX)
                events |= EPOLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= EPOLLOUT;

        return events;
}

void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        lockdep_assert_held(&ctx->wqh.lock);

        *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
        ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
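
/*
 * Teardown usage sketch (illustrative, hypothetical consumer; not part of
 * this file): a consumer that registered a wait queue entry on the eventfd
 * (e.g. through a poll callback) detaches it on shutdown and atomically
 * consumes any count that was signalled in the meantime:
 *
 *        __u64 cnt;
 *
 *        if (eventfd_ctx_remove_wait_queue(ctx, &consumer->wait, &cnt) == 0)
 *                consumer_handle_pending(cnt);  // hypothetical helper for the raced events
 *        eventfd_ctx_put(ctx);
 */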

static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct eventfd_ctx *ctx = file->private_data;
        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);

        if (iov_iter_count(to) < sizeof(ucnt))
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        if (!ctx->count) {
                if ((file->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        spin_unlock_irq(&ctx->wqh.lock);
                        return -EAGAIN;
                }
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count)
                                break;
                        if (signal_pending(current)) {
                                __remove_wait_queue(&ctx->wqh, &wait);
                                __set_current_state(TASK_RUNNING);
                                spin_unlock_irq(&ctx->wqh.lock);
                                return -ERESTARTSYS;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        eventfd_ctx_do_read(ctx, &ucnt);
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        spin_unlock_irq(&ctx->wqh.lock);
        if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
                return -EFAULT;

        return sizeof(ucnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;

        spin_lock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-count: %16llx\n",
                   (unsigned long long)ctx->count);
        spin_unlock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif
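
/*
 * With CONFIG_PROC_FS, the two seq_printf() calls above make an eventfd show
 * up in /proc/<pid>/fdinfo/<fd> roughly as follows (illustrative values; note
 * the count is printed in hex):
 *
 *        eventfd-count:                3
 *        eventfd-id: 14
 */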

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll           = eventfd_poll,
        .read_iter      = eventfd_read,
        .write          = eventfd_write,
        .llseek         = noop_llseek,
};

/**
 * eventfd_fget - Acquires a reference to an eventfd file.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or one
 * of the following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
        if (file->f_op != &eventfd_fops) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }

        return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise one of the
 * following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        struct eventfd_ctx *ctx;
        struct fd f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);
        ctx = eventfd_ctx_fileget(f.file);
        fdput(f);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        struct eventfd_ctx *ctx;

        if (file->f_op != &eventfd_fops)
                return ERR_PTR(-EINVAL);

        ctx = file->private_data;
        kref_get(&ctx->kref);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
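
/*
 * Usage sketch for the helpers above (illustrative, hypothetical driver code;
 * not part of this file): a driver handed an eventfd file descriptor from
 * userspace, e.g. through an ioctl, pins the context once, signals it from
 * its completion path, and drops the reference on teardown:
 *
 *        struct eventfd_ctx *ctx = eventfd_ctx_fdget(args->efd);
 *
 *        if (IS_ERR(ctx))
 *                return PTR_ERR(ctx);
 *        my_dev->done_ctx = ctx;                 // hypothetical field
 *        ...
 *        eventfd_signal(my_dev->done_ctx, 1);    // completion path
 *        ...
 *        eventfd_ctx_put(my_dev->done_ctx);      // teardown
 */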

static int do_eventfd(unsigned int count, int flags)
{
        struct eventfd_ctx *ctx;
        struct file *file;
        int fd;

        /* Check the EFD_* constants for consistency.  */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;
        ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

        flags &= EFD_SHARED_FCNTL_FLAGS;
        flags |= O_RDWR;
        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                goto err;

        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                fd = PTR_ERR(file);
                goto err;
        }

        file->f_mode |= FMODE_NOWAIT;
        fd_install(fd, file);
        return fd;
err:
        eventfd_free_ctx(ctx);
        return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}
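
/*
 * Userspace view of the two syscalls above (illustrative only, via the glibc
 * eventfd(3) wrapper): the flags map directly onto EFD_FLAGS_SET, and
 * EFD_SEMAPHORE makes each read(2) return 1 and decrement the counter by 1
 * instead of resetting it to zero (see eventfd_ctx_do_read() above).
 *
 *        #include <sys/eventfd.h>
 *
 *        int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
 *        if (efd < 0)
 *                perror("eventfd");
 */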