diff options
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 289 |
1 files changed, 226 insertions, 63 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3534d36a1474..7c0980db77b3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,6 +37,7 @@ #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> +#include <linux/capability.h> #include <net/busy_poll.h> /* @@ -206,7 +207,7 @@ struct eventpoll { */ struct epitem *ovflist; - /* wakeup_source used when ep_scan_ready_list is running */ + /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ @@ -227,6 +228,11 @@ struct eventpoll { #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; + /* busy poll timeout */ + u32 busy_poll_usecs; + /* busy poll packet budget */ + u16 busy_poll_budget; + bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -312,7 +318,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -static struct ctl_table epoll_table[] = { +static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -387,11 +393,43 @@ static inline int ep_events_available(struct eventpoll *ep) } #ifdef CONFIG_NET_RX_BUSY_POLL +/** + * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value + * from the epoll instance ep is preferred, but if it is not set fallback to + * the system-wide global via busy_loop_timeout. + * + * @start_time: The start time used to compute the remaining time until timeout. + * @ep: Pointer to the eventpoll context. + * + * Return: true if the timeout has expired, false otherwise. + */ +static bool busy_loop_ep_timeout(unsigned long start_time, + struct eventpoll *ep) +{ + unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); + + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } else { + return busy_loop_timeout(start_time); + } +} + +static bool ep_busy_loop_on(struct eventpoll *ep) +{ + return !!READ_ONCE(ep->busy_poll_usecs) || + READ_ONCE(ep->prefer_busy_poll) || + net_busy_loop_on(); +} + static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; - return ep_events_available(ep) || busy_loop_timeout(start_time); + return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* @@ -403,10 +441,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); + u16 budget = READ_ONCE(ep->busy_poll_budget); + bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); + + if (!budget) + budget = BUSY_POLL_BUDGET; - if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, - BUSY_POLL_BUDGET); + if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* @@ -414,6 +457,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ + if (prefer_busy_poll) + napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } @@ -425,12 +470,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { - struct eventpoll *ep; + struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; - if (!net_busy_loop_on()) + if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); @@ -442,7 +487,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) return; napi_id = READ_ONCE(sk->sk_napi_id); - ep = epi->ep; /* Non-NAPI IDs can be rejected * or @@ -455,6 +499,65 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) ep->napi_id = napi_id; } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct eventpoll *ep = file->private_data; + void __user *uarg = (void __user *)arg; + struct epoll_params epoll_params; + + switch (cmd) { + case EPIOCSPARAMS: + if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) + return -EFAULT; + + /* pad byte must be zero */ + if (epoll_params.__pad) + return -EINVAL; + + if (epoll_params.busy_poll_usecs > S32_MAX) + return -EINVAL; + + if (epoll_params.prefer_busy_poll > 1) + return -EINVAL; + + if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && + !capable(CAP_NET_ADMIN)) + return -EPERM; + + WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); + WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); + WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); + return 0; + case EPIOCGPARAMS: + memset(&epoll_params, 0, sizeof(epoll_params)); + epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); + epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); + epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); + if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) + return -EFAULT; + return 0; + default: + return -ENOIOCTLCMD; + } +} + +static void ep_suspend_napi_irqs(struct eventpoll *ep) +{ + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + napi_suspend_irqs(napi_id); +} + +static void ep_resume_napi_irqs(struct eventpoll *ep) +{ + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + napi_resume_irqs(napi_id); +} + #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) @@ -466,6 +569,20 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EOPNOTSUPP; +} + +static void ep_suspend_napi_irqs(struct eventpoll *ep) +{ +} + +static void ep_resume_napi_irqs(struct eventpoll *ep) +{ +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ /* @@ -678,12 +795,6 @@ static void ep_done_scan(struct eventpoll *ep, write_unlock_irq(&ep->lock); } -static void epi_rcu_free(struct rcu_head *head) -{ - struct epitem *epi = container_of(head, struct epitem, rcu); - kmem_cache_free(epi_cache, epi); -} - static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); @@ -703,6 +814,7 @@ static bool ep_refcount_dec_and_test(struct eventpoll *ep) static void ep_free(struct eventpoll *ep) { + ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); @@ -740,7 +852,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -767,7 +880,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ - call_rcu(&epi->rcu, epi_rcu_free); + kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); @@ -825,6 +938,27 @@ static void ep_clear_and_put(struct eventpoll *ep) ep_free(ep); } +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + if (!is_file_epoll(file)) + return -EINVAL; + + switch (cmd) { + case EPIOCSPARAMS: + case EPIOCGPARAMS: + ret = ep_eventpoll_bp_ioctl(file, cmd, arg); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; @@ -876,6 +1010,34 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep } /* + * The ffd.file pointer may be in the process of being torn down due to + * being closed, but we may not have finished eventpoll_release() yet. + * + * Normally, even with the atomic_long_inc_not_zero, the file may have + * been free'd and then gotten re-allocated to something else (since + * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). + * + * But for epoll, users hold the ep->mtx mutex, and as such any file in + * the process of being free'd will block in eventpoll_release_file() + * and thus the underlying file allocation will not be free'd, and the + * file re-use cannot happen. + * + * For the same reason we can avoid a rcu_read_lock() around the + * operation - 'ffd.file' cannot go away even if the refcount has + * reached zero (but we must still not call out to ->poll() functions + * etc). + */ +static struct file *epi_fget(const struct epitem *epi) +{ + struct file *file; + + file = epi->ffd.file; + if (!file_ref_get(&file->f_ref)) + file = NULL; + return file; +} + +/* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. @@ -883,14 +1045,22 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { - struct file *file = epi->ffd.file; + struct file *file = epi_fget(epi); __poll_t res; + /* + * We could return EPOLLERR | EPOLLHUP or something, but let's + * treat this more as "file doesn't exist, poll didn't happen". + */ + if (!file) + return 0; + pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); + fput(file); return res & epi->event.events; } @@ -931,6 +1101,8 @@ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, + .unlocked_ioctl = ep_eventpoll_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* @@ -1153,7 +1325,7 @@ static inline bool chain_epi_lockless(struct epitem *epi) * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_scan_ready_list(), which stops all list modifications and guarantees + * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called @@ -1230,7 +1402,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1461,7 +1636,8 @@ allocate: spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); @@ -1751,7 +1927,7 @@ static int ep_send_events(struct eventpoll *ep, * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the + * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1861,8 +2037,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * trying again in search of more luck. */ res = ep_send_events(ep, events, maxevents); - if (res) + if (res) { + if (res > 0) + ep_suspend_napi_irqs(ep); return res; + } } if (timed_out) @@ -1904,7 +2083,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_INTERRUPTIBLE); /* - * Do the final check under the lock. ep_scan_ready_list() + * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is @@ -2112,25 +2291,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, { int error; int full_check = 0; - struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; - error = -EBADF; - f = fdget(epfd); - if (!f.file) - goto error_return; + CLASS(fd, f)(epfd); + if (fd_empty(f)) + return -EBADF; /* Get the "struct file *" for the target file */ - tf = fdget(fd); - if (!tf.file) - goto error_fput; + CLASS(fd, tf)(fd); + if (fd_empty(tf)) + return -EBADF; /* The target file descriptor must support poll */ - error = -EPERM; - if (!file_can_poll(tf.file)) - goto error_tgt_fput; + if (!file_can_poll(fd_file(tf))) + return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) @@ -2142,7 +2318,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (f.file == tf.file || !is_file_epoll(f.file)) + if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* @@ -2153,7 +2329,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; - if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || + if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2162,7 +2338,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = f.file->private_data; + ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file @@ -2183,16 +2359,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || - is_file_epoll(tf.file)) { + if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || + is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; - if (is_file_epoll(tf.file)) { - tep = tf.file->private_data; + if (is_file_epoll(fd_file(tf))) { + tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; @@ -2208,14 +2384,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ - epi = ep_find(ep, tf.file, fd); + epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, epds, tf.file, fd, full_check); + error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; @@ -2249,12 +2425,6 @@ error_tgt_fput: loop_check_gen++; mutex_unlock(&epnested_mutex); } - - fdput(tf); -error_fput: - fdput(f); -error_return: - return error; } @@ -2282,8 +2452,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { - int error; - struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ @@ -2295,30 +2463,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, return -EFAULT; /* Get the "struct file *" for the eventpoll file */ - f = fdget(epfd); - if (!f.file) + CLASS(fd, f)(epfd); + if (fd_empty(f)) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ - error = -EINVAL; - if (!is_file_epoll(f.file)) - goto error_fput; + if (!is_file_epoll(fd_file(f))) + return -EINVAL; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = f.file->private_data; + ep = fd_file(f)->private_data; /* Time to fish for events ... */ - error = ep_poll(ep, events, maxevents, to); - -error_fput: - fdput(f); - return error; + return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, |