summaryrefslogtreecommitdiff
path: root/fs/eventfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/eventfd.c')
-rw-r--r--fs/eventfd.c354
1 files changed, 161 insertions, 193 deletions
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9b96e6..3219e0d596fe 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/eventfd.c
*
@@ -9,7 +10,7 @@
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
@@ -21,6 +22,10 @@
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/idr.h>
+#include <linux/uio.h>
+
+static DEFINE_IDA(eventfd_ida);
struct eventfd_ctx {
struct kref kref;
@@ -28,47 +33,56 @@ struct eventfd_ctx {
/*
* Every time that a write(2) is performed on an eventfd, the
* value of the __u64 being written is added to "count" and a
- * wakeup is performed on "wqh". A read(2) will return the "count"
- * value to userspace, and will reset "count" to zero. The kernel
- * side eventfd_signal() also, adds to the "count" counter and
- * issue a wakeup.
+ * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not
+ * specified, a read(2) will return the "count" value to userspace,
+ * and will reset "count" to zero. The kernel side eventfd_signal()
+ * also, adds to the "count" counter and issue a wakeup.
*/
__u64 count;
unsigned int flags;
+ int id;
};
/**
- * eventfd_signal - Adds @n to the eventfd counter.
+ * eventfd_signal_mask - Increment the event counter
* @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * The value cannot be negative.
+ * @mask: [in] poll mask
*
* This function is supposed to be called by the kernel in paths that do not
* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returining a POLLERR
+ * value, and we signal this as overflow condition by returning a EPOLLERR
* to poll(2).
- *
- * Returns the amount by which the counter was incrememnted. This will be less
- * than @n if the counter has overflowed.
*/
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
unsigned long flags;
+ /*
+ * Deadlock or stack overflow issues can happen if we recurse here
+ * through waitqueue wakeup handlers. If the caller users potentially
+ * nested waitqueues with custom wakeup handlers, then it should
+ * check eventfd_signal_allowed() before calling this function. If
+ * it returns false, the eventfd_signal() call should be deferred to a
+ * safe context.
+ */
+ if (WARN_ON_ONCE(current->in_eventfd))
+ return;
+
spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ULLONG_MAX - ctx->count < n)
- n = ULLONG_MAX - ctx->count;
- ctx->count += n;
+ current->in_eventfd = 1;
+ if (ctx->count < ULLONG_MAX)
+ ctx->count++;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, POLLIN);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
+ current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
-
- return n;
}
-EXPORT_SYMBOL_GPL(eventfd_signal);
+EXPORT_SYMBOL_GPL(eventfd_signal_mask);
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
+ if (ctx->id >= 0)
+ ida_free(&eventfd_ida, ctx->id);
kfree(ctx);
}
@@ -80,24 +94,11 @@ static void eventfd_free(struct kref *kref)
}
/**
- * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
- * @ctx: [in] Pointer to the eventfd context.
- *
- * Returns: In case of success, returns a pointer to the eventfd context.
- */
-struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
-{
- kref_get(&ctx->kref);
- return ctx;
-}
-EXPORT_SYMBOL_GPL(eventfd_ctx_get);
-
-/**
* eventfd_ctx_put - Releases a reference to the internal eventfd context.
* @ctx: [in] Pointer to eventfd context.
*
* The eventfd context reference must have been previously acquired either
- * with eventfd_ctx_get() or eventfd_ctx_fdget().
+ * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
*/
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
@@ -109,36 +110,77 @@ static int eventfd_release(struct inode *inode, struct file *file)
{
struct eventfd_ctx *ctx = file->private_data;
- wake_up_poll(&ctx->wqh, POLLHUP);
+ wake_up_poll(&ctx->wqh, EPOLLHUP);
eventfd_ctx_put(ctx);
return 0;
}
-static unsigned int eventfd_poll(struct file *file, poll_table *wait)
+static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
struct eventfd_ctx *ctx = file->private_data;
- unsigned int events = 0;
- unsigned long flags;
+ __poll_t events = 0;
+ u64 count;
poll_wait(file, &ctx->wqh, wait);
- spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ctx->count > 0)
- events |= POLLIN;
- if (ctx->count == ULLONG_MAX)
- events |= POLLERR;
- if (ULLONG_MAX - 1 > ctx->count)
- events |= POLLOUT;
- spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+ /*
+ * All writes to ctx->count occur within ctx->wqh.lock. This read
+ * can be done outside ctx->wqh.lock because we know that poll_wait
+ * takes that lock (through add_wait_queue) if our caller will sleep.
+ *
+ * The read _can_ therefore seep into add_wait_queue's critical
+ * section, but cannot move above it! add_wait_queue's spin_lock acts
+ * as an acquire barrier and ensures that the read be ordered properly
+ * against the writes. The following CAN happen and is safe:
+ *
+ * poll write
+ * ----------------- ------------
+ * lock ctx->wqh.lock (in poll_wait)
+ * count = ctx->count
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * lock ctx->qwh.lock
+ * ctx->count += n
+ * if (waitqueue_active)
+ * wake_up_locked_poll
+ * unlock ctx->qwh.lock
+ * eventfd_poll returns 0
+ *
+ * but the following, which would miss a wakeup, cannot happen:
+ *
+ * poll write
+ * ----------------- ------------
+ * count = ctx->count (INVALID!)
+ * lock ctx->qwh.lock
+ * ctx->count += n
+ * **waitqueue_active is false**
+ * **no wake_up_locked_poll!**
+ * unlock ctx->qwh.lock
+ * lock ctx->wqh.lock (in poll_wait)
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * eventfd_poll returns 0
+ */
+ count = READ_ONCE(ctx->count);
+
+ if (count > 0)
+ events |= EPOLLIN;
+ if (count == ULLONG_MAX)
+ events |= EPOLLERR;
+ if (ULLONG_MAX - 1 > count)
+ events |= EPOLLOUT;
return events;
}
-static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
+void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
- *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+ lockdep_assert_held(&ctx->wqh.lock);
+
+ *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
ctx->count -= *cnt;
}
+EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
/**
* eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
@@ -153,7 +195,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
* This is used to atomically remove a wait queue entry from the eventfd wait
* queue head, and read/reset the counter value.
*/
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt)
{
unsigned long flags;
@@ -162,81 +204,44 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
eventfd_ctx_do_read(ctx, cnt);
__remove_wait_queue(&ctx->wqh, wait);
if (*cnt != 0 && waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, POLLOUT);
+ wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
-/**
- * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
- * @ctx: [in] Pointer to eventfd context.
- * @no_wait: [in] Different from zero if the operation should not block.
- * @cnt: [out] Pointer to the 64-bit counter value.
- *
- * Returns %0 if successful, or the following error codes:
- *
- * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
- * -ERESTARTSYS : A signal interrupted the wait operation.
- *
- * If @no_wait is zero, the function might sleep until the eventfd internal
- * counter becomes greater than zero.
- */
-ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
+static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
- ssize_t res;
- DECLARE_WAITQUEUE(wait, current);
+ struct file *file = iocb->ki_filp;
+ struct eventfd_ctx *ctx = file->private_data;
+ __u64 ucnt = 0;
+ if (iov_iter_count(to) < sizeof(ucnt))
+ return -EINVAL;
spin_lock_irq(&ctx->wqh.lock);
- *cnt = 0;
- res = -EAGAIN;
- if (ctx->count > 0)
- res = 0;
- else if (!no_wait) {
- __add_wait_queue(&ctx->wqh, &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (ctx->count > 0) {
- res = 0;
- break;
- }
- if (signal_pending(current)) {
- res = -ERESTARTSYS;
- break;
- }
+ if (!ctx->count) {
+ if ((file->f_flags & O_NONBLOCK) ||
+ (iocb->ki_flags & IOCB_NOWAIT)) {
spin_unlock_irq(&ctx->wqh.lock);
- schedule();
- spin_lock_irq(&ctx->wqh.lock);
+ return -EAGAIN;
+ }
+
+ if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) {
+ spin_unlock_irq(&ctx->wqh.lock);
+ return -ERESTARTSYS;
}
- __remove_wait_queue(&ctx->wqh, &wait);
- __set_current_state(TASK_RUNNING);
- }
- if (likely(res == 0)) {
- eventfd_ctx_do_read(ctx, cnt);
- if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, POLLOUT);
}
+ eventfd_ctx_do_read(ctx, &ucnt);
+ current->in_eventfd = 1;
+ if (waitqueue_active(&ctx->wqh))
+ wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+ current->in_eventfd = 0;
spin_unlock_irq(&ctx->wqh.lock);
+ if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
+ return -EFAULT;
- return res;
-}
-EXPORT_SYMBOL_GPL(eventfd_ctx_read);
-
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct eventfd_ctx *ctx = file->private_data;
- ssize_t res;
- __u64 cnt;
-
- if (count < sizeof(cnt))
- return -EINVAL;
- res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
- if (res < 0)
- return res;
-
- return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+ return sizeof(ucnt);
}
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
@@ -245,9 +250,8 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
struct eventfd_ctx *ctx = file->private_data;
ssize_t res;
__u64 ucnt;
- DECLARE_WAITQUEUE(wait, current);
- if (count < sizeof(ucnt))
+ if (count != sizeof(ucnt))
return -EINVAL;
if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
return -EFAULT;
@@ -258,28 +262,17 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
if (ULLONG_MAX - ctx->count > ucnt)
res = sizeof(ucnt);
else if (!(file->f_flags & O_NONBLOCK)) {
- __add_wait_queue(&ctx->wqh, &wait);
- for (res = 0;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (ULLONG_MAX - ctx->count > ucnt) {
- res = sizeof(ucnt);
- break;
- }
- if (signal_pending(current)) {
- res = -ERESTARTSYS;
- break;
- }
- spin_unlock_irq(&ctx->wqh.lock);
- schedule();
- spin_lock_irq(&ctx->wqh.lock);
- }
- __remove_wait_queue(&ctx->wqh, &wait);
- __set_current_state(TASK_RUNNING);
+ res = wait_event_interruptible_locked_irq(ctx->wqh,
+ ULLONG_MAX - ctx->count > ucnt);
+ if (!res)
+ res = sizeof(ucnt);
}
if (likely(res > 0)) {
ctx->count += ucnt;
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, POLLIN);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
@@ -287,17 +280,22 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
}
#ifdef CONFIG_PROC_FS
-static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
+static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventfd_ctx *ctx = f->private_data;
- int ret;
+ __u64 cnt;
spin_lock_irq(&ctx->wqh.lock);
- ret = seq_printf(m, "eventfd-count: %16llx\n",
- (unsigned long long)ctx->count);
+ cnt = ctx->count;
spin_unlock_irq(&ctx->wqh.lock);
- return ret;
+ seq_printf(m,
+ "eventfd-count: %16llx\n"
+ "eventfd-id: %d\n"
+ "eventfd-semaphore: %d\n",
+ cnt,
+ ctx->id,
+ !!(ctx->flags & EFD_SEMAPHORE));
}
#endif
@@ -307,7 +305,7 @@ static const struct file_operations eventfd_fops = {
#endif
.release = eventfd_release,
.poll = eventfd_poll,
- .read = eventfd_read,
+ .read_iter = eventfd_read,
.write = eventfd_write,
.llseek = noop_llseek,
};
@@ -349,16 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
*/
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
- struct file *file;
- struct eventfd_ctx *ctx;
-
- file = eventfd_fget(fd);
- if (IS_ERR(file))
- return (struct eventfd_ctx *) file;
- ctx = eventfd_ctx_get(file->private_data);
- fput(file);
-
- return ctx;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ return eventfd_ctx_fileget(fd_file(f));
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
@@ -373,83 +365,59 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
*/
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
+ struct eventfd_ctx *ctx;
+
if (file->f_op != &eventfd_fops)
return ERR_PTR(-EINVAL);
- return eventfd_ctx_get(file->private_data);
+ ctx = file->private_data;
+ kref_get(&ctx->kref);
+ return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
-/**
- * eventfd_file_create - Creates an eventfd file pointer.
- * @count: Initial eventfd counter value.
- * @flags: Flags for the eventfd file.
- *
- * This function creates an eventfd file pointer, w/out installing it into
- * the fd table. This is useful when the eventfd file is used during the
- * initialization of data structures that require extra setup after the eventfd
- * creation. So the eventfd creation is split into the file pointer creation
- * phase, and the file descriptor installation phase.
- * In this way races with userspace closing the newly installed file descriptor
- * can be avoided.
- * Returns an eventfd file pointer, or a proper error pointer.
- */
-struct file *eventfd_file_create(unsigned int count, int flags)
+static int do_eventfd(unsigned int count, int flags)
{
- struct file *file;
- struct eventfd_ctx *ctx;
+ struct eventfd_ctx *ctx __free(kfree) = NULL;
/* Check the EFD_* constants for consistency. */
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+ BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));
if (flags & ~EFD_FLAGS_SET)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
kref_init(&ctx->kref);
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
- file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
- O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
- if (IS_ERR(file))
- eventfd_free_ctx(ctx);
+ flags &= EFD_SHARED_FCNTL_FLAGS;
+ flags |= O_RDWR;
- return file;
+ FD_PREPARE(fdf, flags,
+ anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, ctx,
+ flags, FMODE_NOWAIT));
+ if (fdf.err)
+ return fdf.err;
+
+ ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
- int fd, error;
- struct file *file;
-
- error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
- if (error < 0)
- return error;
- fd = error;
-
- file = eventfd_file_create(count, flags);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_put_unused_fd;
- }
- fd_install(fd, file);
-
- return fd;
-
-err_put_unused_fd:
- put_unused_fd(fd);
-
- return error;
+ return do_eventfd(count, flags);
}
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
- return sys_eventfd2(count, 0);
+ return do_eventfd(count, 0);
}