Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--	fs/eventpoll.c	289
1 file changed, 226 insertions(+), 63 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..7c0980db77b3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,6 +37,7 @@
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
+#include <linux/capability.h>
#include <net/busy_poll.h>
/*
@@ -206,7 +207,7 @@ struct eventpoll {
*/
struct epitem *ovflist;
- /* wakeup_source used when ep_scan_ready_list is running */
+ /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
@@ -227,6 +228,11 @@ struct eventpoll {
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
+ /* busy poll timeout */
+ u32 busy_poll_usecs;
+ /* busy poll packet budget */
+ u16 busy_poll_budget;
+ bool prefer_busy_poll;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -312,7 +318,7 @@ static void unlist_file(struct epitems_head *head)
static long long_zero;
static long long_max = LONG_MAX;
-static struct ctl_table epoll_table[] = {
+static const struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
@@ -387,11 +393,43 @@ static inline int ep_events_available(struct eventpoll *ep)
}
#ifdef CONFIG_NET_RX_BUSY_POLL
+/**
+ * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
+ * from the epoll instance ep is preferred, but if it is not set, fall back to
+ * the system-wide global via busy_loop_timeout.
+ *
+ * @start_time: The start time used to compute the remaining time until timeout.
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Return: true if the timeout has expired, false otherwise.
+ */
+static bool busy_loop_ep_timeout(unsigned long start_time,
+ struct eventpoll *ep)
+{
+ unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
+
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ } else {
+ return busy_loop_timeout(start_time);
+ }
+}
+
+static bool ep_busy_loop_on(struct eventpoll *ep)
+{
+ return !!READ_ONCE(ep->busy_poll_usecs) ||
+ READ_ONCE(ep->prefer_busy_poll) ||
+ net_busy_loop_on();
+}
+
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
struct eventpoll *ep = p;
- return ep_events_available(ep) || busy_loop_timeout(start_time);
+ return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}
/*
@@ -403,10 +441,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
+ u16 budget = READ_ONCE(ep->busy_poll_budget);
+ bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+
+ if (!budget)
+ budget = BUSY_POLL_BUDGET;
- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
- BUSY_POLL_BUDGET);
+ if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
+ napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
/*
@@ -414,6 +457,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
+ if (prefer_busy_poll)
+ napi_resume_irqs(napi_id);
ep->napi_id = 0;
return false;
}
@@ -425,12 +470,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
*/
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
- struct eventpoll *ep;
+ struct eventpoll *ep = epi->ep;
unsigned int napi_id;
struct socket *sock;
struct sock *sk;
- if (!net_busy_loop_on())
+ if (!ep_busy_loop_on(ep))
return;
sock = sock_from_file(epi->ffd.file);
@@ -442,7 +487,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
return;
napi_id = READ_ONCE(sk->sk_napi_id);
- ep = epi->ep;
/* Non-NAPI IDs can be rejected
* or
@@ -455,6 +499,65 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
ep->napi_id = napi_id;
}
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct eventpoll *ep = file->private_data;
+ void __user *uarg = (void __user *)arg;
+ struct epoll_params epoll_params;
+
+ switch (cmd) {
+ case EPIOCSPARAMS:
+ if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
+ return -EFAULT;
+
+ /* pad byte must be zero */
+ if (epoll_params.__pad)
+ return -EINVAL;
+
+ if (epoll_params.busy_poll_usecs > S32_MAX)
+ return -EINVAL;
+
+ if (epoll_params.prefer_busy_poll > 1)
+ return -EINVAL;
+
+ if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
+ !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
+ WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
+ WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
+ return 0;
+ case EPIOCGPARAMS:
+ memset(&epoll_params, 0, sizeof(epoll_params));
+ epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
+ epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
+ epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+ if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ napi_suspend_irqs(napi_id);
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ napi_resume_irqs(napi_id);
+}
+
#else
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
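
For reference, a minimal userspace sketch of driving the EPIOCSPARAMS/EPIOCGPARAMS interface added above, assuming the matching struct epoll_params uapi definitions ship in <linux/eventpoll.h>; epoll_set_busy_poll() and the parameter values are illustrative. Per the handler above, a nonzero busy_poll_usecs takes precedence over the system-wide busy-poll setting, it must not exceed S32_MAX, prefer_busy_poll may only be 0 or 1, a budget above NAPI_POLL_WEIGHT needs CAP_NET_ADMIN, and the pad byte must stay zero:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/eventpoll.h>	/* struct epoll_params, EPIOC[SG]PARAMS */

/* epfd is an epoll descriptor obtained from epoll_create1(). */
static int epoll_set_busy_poll(int epfd)
{
	struct epoll_params params;

	memset(&params, 0, sizeof(params));	/* keeps __pad zero */
	params.busy_poll_usecs = 64;		/* must be <= S32_MAX */
	params.busy_poll_budget = 8;		/* > NAPI_POLL_WEIGHT needs CAP_NET_ADMIN */
	params.prefer_busy_poll = 1;		/* only 0 or 1 is accepted */

	if (ioctl(epfd, EPIOCSPARAMS, &params) == -1) {
		perror("ioctl(EPIOCSPARAMS)");
		return -1;
	}

	/* Read back what the kernel stored. */
	if (ioctl(epfd, EPIOCGPARAMS, &params) == -1) {
		perror("ioctl(EPIOCGPARAMS)");
		return -1;
	}
	printf("busy_poll_usecs=%u budget=%u prefer=%u\n",
	       params.busy_poll_usecs, params.busy_poll_budget,
	       params.prefer_busy_poll);
	return 0;
}
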
@@ -466,6 +569,20 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
@@ -678,12 +795,6 @@ static void ep_done_scan(struct eventpoll *ep,
write_unlock_irq(&ep->lock);
}
-static void epi_rcu_free(struct rcu_head *head)
-{
- struct epitem *epi = container_of(head, struct epitem, rcu);
- kmem_cache_free(epi_cache, epi);
-}
-
static void ep_get(struct eventpoll *ep)
{
refcount_inc(&ep->refcount);
@@ -703,6 +814,7 @@ static bool ep_refcount_dec_and_test(struct eventpoll *ep)
static void ep_free(struct eventpoll *ep)
{
+ ep_resume_napi_irqs(ep);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
wakeup_source_unregister(ep->ws);
@@ -740,7 +852,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
- file->f_ep = NULL;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
struct epitems_head *v;
v = container_of(head, struct epitems_head, epitems);
@@ -767,7 +880,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
* use of the rbn field.
*/
- call_rcu(&epi->rcu, epi_rcu_free);
+ kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
return ep_refcount_dec_and_test(ep);
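
The epi_rcu_free()/call_rcu() removal above follows the generic conversion sketched below: when an RCU callback does nothing but free the object, kfree_rcu() can take its place and no callback function is needed. The names (struct foo, foo_release_*) are illustrative, not eventpoll code:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

/* Before: free via an explicit callback scheduled with call_rcu(). */
static void foo_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

static void foo_release_call_rcu(struct foo *f)
{
	call_rcu(&f->rcu, foo_rcu_free);
}

/* After: kfree_rcu() frees the object once a grace period has elapsed. */
static void foo_release_kfree_rcu(struct foo *f)
{
	kfree_rcu(f, rcu);
}
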
@@ -825,6 +938,27 @@ static void ep_clear_and_put(struct eventpoll *ep)
ep_free(ep);
}
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ switch (cmd) {
+ case EPIOCSPARAMS:
+ case EPIOCGPARAMS:
+ ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
struct eventpoll *ep = file->private_data;
@@ -876,6 +1010,34 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
}
/*
+ * The ffd.file pointer may be in the process of being torn down due to
+ * being closed, but we may not have finished eventpoll_release() yet.
+ *
+ * Normally, even with the atomic_long_inc_not_zero, the file may have
+ * been free'd and then gotten re-allocated to something else (since
+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ *
+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
+ * the process of being free'd will block in eventpoll_release_file()
+ * and thus the underlying file allocation will not be free'd, and the
+ * file re-use cannot happen.
+ *
+ * For the same reason we can avoid a rcu_read_lock() around the
+ * operation - 'ffd.file' cannot go away even if the refcount has
+ * reached zero (but we must still not call out to ->poll() functions
+ * etc).
+ */
+static struct file *epi_fget(const struct epitem *epi)
+{
+ struct file *file;
+
+ file = epi->ffd.file;
+ if (!file_ref_get(&file->f_ref))
+ file = NULL;
+ return file;
+}
+
+/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
@@ -883,14 +1045,22 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
- struct file *file = epi->ffd.file;
+ struct file *file = epi_fget(epi);
__poll_t res;
+ /*
+ * We could return EPOLLERR | EPOLLHUP or something, but let's
+ * treat this more as "file doesn't exist, poll didn't happen".
+ */
+ if (!file)
+ return 0;
+
pt->_key = epi->event.events;
if (!is_file_epoll(file))
res = vfs_poll(file, pt);
else
res = __ep_eventpoll_poll(file, pt, depth);
+ fput(file);
return res & epi->event.events;
}
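
The epi_fget()/file_ref_get() change above is an instance of the conditional-get idiom sketched below: the pointer is read under a convention that keeps the memory from being recycled (here, ep->mtx holding off eventpoll_release_file()), and a reference is taken only if the count has not already dropped to zero. struct obj and the helpers are illustrative and use the generic refcount API rather than the file_ref API:

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t ref;
	int payload;
};

/* Returns the object with an extra reference, or NULL if it is already
 * on its way out (count hit zero), mirroring epi_fget() returning NULL
 * when file_ref_get() fails. */
static struct obj *obj_tryget(struct obj *o)
{
	if (!refcount_inc_not_zero(&o->ref))
		return NULL;
	return o;
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->ref))
		kfree(o);
}
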
@@ -931,6 +1101,8 @@ static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
.llseek = noop_llseek,
+ .unlocked_ioctl = ep_eventpoll_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -1153,7 +1325,7 @@ static inline bool chain_epi_lockless(struct epitem *epi)
* This callback takes a read lock in order not to contend with concurrent
* events from another file descriptor, thus all modifications to ->rdllist
* or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * ep_start/done_scan(), which stops all list modifications and guarantees
* that lists state is seen correctly.
*
* Another thing worth to mention is that ep_poll_callback() can be called
@@ -1230,7 +1402,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up(&ep->wq);
+ if (sync)
+ wake_up_sync(&ep->wq);
+ else
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1461,7 +1636,8 @@ allocate:
spin_unlock(&file->f_lock);
goto allocate;
}
- file->f_ep = head;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, head);
to_free = NULL;
}
hlist_add_head_rcu(&epi->fllink, file->f_ep);
@@ -1751,7 +1927,7 @@ static int ep_send_events(struct eventpoll *ep,
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
+ * ep_send_events() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1861,8 +2037,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
- if (res)
+ if (res) {
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
return res;
+ }
}
if (timed_out)
@@ -1904,7 +2083,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
__set_current_state(TASK_INTERRUPTIBLE);
/*
- * Do the final check under the lock. ep_scan_ready_list()
+ * Do the final check under the lock. ep_start/done_scan()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
@@ -2112,25 +2291,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
{
int error;
int full_check = 0;
- struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
- error = -EBADF;
- f = fdget(epfd);
- if (!f.file)
- goto error_return;
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
+ return -EBADF;
/* Get the "struct file *" for the target file */
- tf = fdget(fd);
- if (!tf.file)
- goto error_fput;
+ CLASS(fd, tf)(fd);
+ if (fd_empty(tf))
+ return -EBADF;
/* The target file descriptor must support poll */
- error = -EPERM;
- if (!file_can_poll(tf.file))
- goto error_tgt_fput;
+ if (!file_can_poll(fd_file(tf)))
+ return -EPERM;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
@@ -2142,7 +2318,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
+ if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
goto error_tgt_fput;
/*
@@ -2153,7 +2329,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@@ -2162,7 +2338,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
@@ -2183,16 +2359,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(tf.file)) {
+ if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+ is_file_epoll(fd_file(tf))) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
+ if (is_file_epoll(fd_file(tf))) {
+ tep = fd_file(tf)->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
@@ -2208,14 +2384,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tf.file, fd);
+ epi = ep_find(ep, fd_file(tf), fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, epds, tf.file, fd, full_check);
+ error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
} else
error = -EEXIST;
break;
@@ -2249,12 +2425,6 @@ error_tgt_fput:
loop_check_gen++;
mutex_unlock(&epnested_mutex);
}
-
- fdput(tf);
-error_fput:
- fdput(f);
-error_return:
-
return error;
}
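
The do_epoll_ctl() conversion above (and do_epoll_wait() below) replaces fdget()/fdput() with the scope-based CLASS(fd, ...) guard from <linux/file.h>, so every early return drops the file reference automatically and the error_fput/error_return labels disappear. A minimal sketch of the pattern, with an illustrative function name:

#include <linux/file.h>
#include <linux/poll.h>

/* Illustrative helper: resolve some_fd, insist that the file supports
 * poll, and succeed.  Every early return is safe because the paired
 * fdput() runs automatically when f goes out of scope. */
static int example_with_class_fd(int some_fd)
{
	CLASS(fd, f)(some_fd);

	if (fd_empty(f))		/* lookup failed: bad descriptor */
		return -EBADF;

	if (!file_can_poll(fd_file(f)))	/* early exits need no unwinding */
		return -EPERM;

	return 0;
}
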
@@ -2282,8 +2452,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
- int error;
- struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
@@ -2295,30 +2463,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
- f = fdget(epfd);
- if (!f.file)
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
- error = -EINVAL;
- if (!is_file_epoll(f.file))
- goto error_fput;
+ if (!is_file_epoll(fd_file(f)))
+ return -EINVAL;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, to);
-
-error_fput:
- fdput(f);
- return error;
+ return ep_poll(ep, events, maxevents, to);
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,