Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 252 |
1 file changed, 176 insertions, 76 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 882b89edc52a..d4dbffdedd08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -318,7 +318,7 @@ static void unlist_file(struct epitems_head *head)
 static long long_zero;
 static long long_max = LONG_MAX;
 
-static struct ctl_table epoll_table[] = {
+static const struct ctl_table epoll_table[] = {
 	{
 		.procname	= "max_user_watches",
 		.data		= &max_user_watches,
@@ -420,7 +420,9 @@ static bool busy_loop_ep_timeout(unsigned long start_time,
 
 static bool ep_busy_loop_on(struct eventpoll *ep)
 {
-	return !!ep->busy_poll_usecs || net_busy_loop_on();
+	return !!READ_ONCE(ep->busy_poll_usecs) ||
+	       READ_ONCE(ep->prefer_busy_poll) ||
+	       net_busy_loop_on();
 }
 
 static bool ep_busy_loop_end(void *p, unsigned long start_time)
@@ -436,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
  *
  * we must do our busy polling with irqs enabled
  */
-static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 	u16 budget = READ_ONCE(ep->busy_poll_budget);
@@ -445,8 +447,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 	if (!budget)
 		budget = BUSY_POLL_BUDGET;
 
-	if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
-		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+	if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
+		napi_busy_loop(napi_id, ep_busy_loop_end,
 			       ep, prefer_busy_poll, budget);
 		if (ep_events_available(ep))
 			return true;
@@ -455,6 +457,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 		 * it back in when we have moved a socket with a valid NAPI
 		 * ID onto the ready list.
 		 */
+		if (prefer_busy_poll)
+			napi_resume_irqs(napi_id);
 		ep->napi_id = 0;
 		return false;
 	}
@@ -488,7 +492,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 	 *	or
 	 * Nothing to do if we already have this ID
 	 */
-	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+	if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
 		return;
 
 	/* record NAPI ID for use in next busy poll */
@@ -538,9 +542,25 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
 	}
 }
 
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+	unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+	if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
+		napi_suspend_irqs(napi_id);
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+	unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+	if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
+		napi_resume_irqs(napi_id);
+}
+
 #else
 
-static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static inline bool ep_busy_loop(struct eventpoll *ep)
 {
 	return false;
 }
@@ -555,6 +575,14 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
 	return -EOPNOTSUPP;
 }
 
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
@@ -786,6 +814,7 @@ static bool ep_refcount_dec_and_test(struct eventpoll *ep)
 
 static void ep_free(struct eventpoll *ep)
 {
+	ep_resume_napi_irqs(ep);
 	mutex_destroy(&ep->mtx);
 	free_uid(ep->user);
 	wakeup_source_unregister(ep->ws);
@@ -823,7 +852,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	to_free = NULL;
 	head = file->f_ep;
 	if (head->first == &epi->fllink && !epi->fllink.next) {
-		file->f_ep = NULL;
+		/* See eventpoll_release() for details. */
+		WRITE_ONCE(file->f_ep, NULL);
 		if (!is_file_epoll(file)) {
 			struct epitems_head *v;
 			v = container_of(head, struct epitems_head, epitems);
@@ -980,6 +1010,34 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
 }
 
 /*
+ * The ffd.file pointer may be in the process of being torn down due to
+ * being closed, but we may not have finished eventpoll_release() yet.
+ *
+ * Normally, even with the atomic_long_inc_not_zero, the file may have
+ * been free'd and then gotten re-allocated to something else (since
+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ *
+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
+ * the process of being free'd will block in eventpoll_release_file()
+ * and thus the underlying file allocation will not be free'd, and the
+ * file re-use cannot happen.
+ *
+ * For the same reason we can avoid a rcu_read_lock() around the
+ * operation - 'ffd.file' cannot go away even if the refcount has
+ * reached zero (but we must still not call out to ->poll() functions
+ * etc).
+ */
+static struct file *epi_fget(const struct epitem *epi)
+{
+	struct file *file;
+
+	file = epi->ffd.file;
+	if (!file_ref_get(&file->f_ref))
+		file = NULL;
+	return file;
+}
+
+/*
  * Differs from ep_eventpoll_poll() in that internal callers already have
  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
  * is correctly annotated.
@@ -987,14 +1045,22 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
 			     int depth)
 {
-	struct file *file = epi->ffd.file;
+	struct file *file = epi_fget(epi);
 	__poll_t res;
 
+	/*
+	 * We could return EPOLLERR | EPOLLHUP or something, but let's
+	 * treat this more as "file doesn't exist, poll didn't happen".
+	 */
+	if (!file)
+		return 0;
+
 	pt->_key = epi->event.events;
 	if (!is_file_epoll(file))
 		res = vfs_poll(file, pt);
 	else
 		res = __ep_eventpoll_poll(file, pt, depth);
+	fput(file);
 	return res & epi->event.events;
 }
 
@@ -1336,7 +1402,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 				break;
 			}
 		}
-		wake_up(&ep->wq);
+		if (sync)
+			wake_up_sync(&ep->wq);
+		else
+			wake_up(&ep->wq);
 	}
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
@@ -1567,7 +1636,8 @@ allocate:
 			spin_unlock(&file->f_lock);
 			goto allocate;
 		}
-		file->f_ep = head;
+		/* See eventpoll_release() for details. */
+		WRITE_ONCE(file->f_ep, head);
 		to_free = NULL;
 	}
 	hlist_add_head_rcu(&epi->fllink, file->f_ep);
@@ -1910,6 +1980,30 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
 	return ret;
 }
 
+static int ep_try_send_events(struct eventpoll *ep,
+			      struct epoll_event __user *events, int maxevents)
+{
+	int res;
+
+	/*
+	 * Try to transfer events to user space. In case we get 0 events and
+	 * there's still timeout left over, we go trying again in search of
+	 * more luck.
+	 */
+	res = ep_send_events(ep, events, maxevents);
+	if (res > 0)
+		ep_suspend_napi_irqs(ep);
+	return res;
+}
+
+static int ep_schedule_timeout(ktime_t *to)
+{
+	if (to)
+		return ktime_after(*to, ktime_get());
+	else
+		return 1;
+}
+
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
  *           event buffer.
@@ -1961,12 +2055,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 
 	while (1) {
 		if (eavail) {
-			/*
-			 * Try to transfer events to user space. In case we get
-			 * 0 events and there's still timeout left over, we go
-			 * trying again in search of more luck.
-			 */
-			res = ep_send_events(ep, events, maxevents);
+			res = ep_try_send_events(ep, events, maxevents);
 			if (res)
 				return res;
 		}
@@ -1974,7 +2063,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		if (timed_out)
 			return 0;
 
-		eavail = ep_busy_loop(ep, timed_out);
+		eavail = ep_busy_loop(ep);
 		if (eavail)
 			continue;
 
@@ -2023,8 +2112,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		write_unlock_irq(&ep->lock);
 
 		if (!eavail)
-			timed_out = !schedule_hrtimeout_range(to, slack,
-							      HRTIMER_MODE_ABS);
+			timed_out = !ep_schedule_timeout(to) ||
+				!schedule_hrtimeout_range(to, slack,
+							  HRTIMER_MODE_ABS);
 		__set_current_state(TASK_RUNNING);
 
 		/*
@@ -2164,11 +2254,6 @@ static int do_epoll_create(int flags)
 		error = PTR_ERR(file);
 		goto out_free_fd;
 	}
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	ep->busy_poll_usecs = 0;
-	ep->busy_poll_budget = 0;
-	ep->prefer_busy_poll = false;
-#endif
 	ep->file = file;
 	fd_install(fd, file);
 	return fd;
@@ -2223,25 +2308,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 {
 	int error;
 	int full_check = 0;
-	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
 	struct eventpoll *tep = NULL;
 
-	error = -EBADF;
-	f = fdget(epfd);
-	if (!f.file)
-		goto error_return;
+	CLASS(fd, f)(epfd);
+	if (fd_empty(f))
+		return -EBADF;
 
 	/* Get the "struct file *" for the target file */
-	tf = fdget(fd);
-	if (!tf.file)
-		goto error_fput;
+	CLASS(fd, tf)(fd);
+	if (fd_empty(tf))
+		return -EBADF;
 
 	/* The target file descriptor must support poll */
-	error = -EPERM;
-	if (!file_can_poll(tf.file))
-		goto error_tgt_fput;
+	if (!file_can_poll(fd_file(tf)))
+		return -EPERM;
 
 	/* Check if EPOLLWAKEUP is allowed */
 	if (ep_op_has_event(op))
@@ -2253,7 +2335,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * adding an epoll file descriptor inside itself.
 	 */
 	error = -EINVAL;
-	if (f.file == tf.file || !is_file_epoll(f.file))
+	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
 		goto error_tgt_fput;
 
 	/*
@@ -2264,7 +2346,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
 			goto error_tgt_fput;
-		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
 				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
 			goto error_tgt_fput;
 	}
@@ -2273,7 +2355,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * At this point it is safe to assume that the "private_data" contains
 	 * our own data structure.
 	 */
-	ep = f.file->private_data;
+	ep = fd_file(f)->private_data;
 
 	/*
 	 * When we insert an epoll file descriptor inside another epoll file
@@ -2294,16 +2376,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	if (error)
 		goto error_tgt_fput;
 	if (op == EPOLL_CTL_ADD) {
-		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
-		    is_file_epoll(tf.file)) {
+		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+		    is_file_epoll(fd_file(tf))) {
 			mutex_unlock(&ep->mtx);
 			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
 			if (error)
 				goto error_tgt_fput;
 			loop_check_gen++;
 			full_check = 1;
-			if (is_file_epoll(tf.file)) {
-				tep = tf.file->private_data;
+			if (is_file_epoll(fd_file(tf))) {
+				tep = fd_file(tf)->private_data;
 				error = -ELOOP;
 				if (ep_loop_check(ep, tep) != 0)
 					goto error_tgt_fput;
@@ -2319,14 +2401,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * above, we can be sure to be able to use the item looked up by
 	 * ep_find() till we release the mutex.
 	 */
-	epi = ep_find(ep, tf.file, fd);
+	epi = ep_find(ep, fd_file(tf), fd);
 
 	error = -EINVAL;
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds->events |= EPOLLERR | EPOLLHUP;
-			error = ep_insert(ep, epds, tf.file, fd, full_check);
+			error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
 		} else
 			error = -EEXIST;
 		break;
@@ -2360,12 +2442,6 @@ error_tgt_fput:
 		loop_check_gen++;
 		mutex_unlock(&epnested_mutex);
 	}
-
-	fdput(tf);
-error_fput:
-	fdput(f);
-error_return:
-
 	return error;
 }
 
@@ -2386,50 +2462,74 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	return do_epoll_ctl(epfd, op, fd, &epds, false);
 }
 
-/*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_wait(2).
- */
-static int do_epoll_wait(int epfd, struct epoll_event __user *events,
-			 int maxevents, struct timespec64 *to)
+static int ep_check_params(struct file *file, struct epoll_event __user *evs,
+			   int maxevents)
 {
-	int error;
-	struct fd f;
-	struct eventpoll *ep;
-
 	/* The maximum number of event must be greater than zero */
 	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
+	if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
 		return -EFAULT;
 
-	/* Get the "struct file *" for the eventpoll file */
-	f = fdget(epfd);
-	if (!f.file)
-		return -EBADF;
-
 	/*
 	 * We have to check that the file structure underneath the fd
 	 * the user passed to us _is_ an eventpoll file.
	 */
-	error = -EINVAL;
-	if (!is_file_epoll(f.file))
-		goto error_fput;
+	if (!is_file_epoll(file))
+		return -EINVAL;
+
+	return 0;
+}
+
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+		     int maxevents)
+{
+	struct eventpoll *ep;
+	int ret;
+
+	ret = ep_check_params(file, events, maxevents);
+	if (unlikely(ret))
+		return ret;
+
+	ep = file->private_data;
+	/*
+	 * Racy call, but that's ok - it should get retried based on
+	 * poll readiness anyway.
+	 */
+	if (ep_events_available(ep))
+		return ep_try_send_events(ep, events, maxevents);
+	return 0;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+static int do_epoll_wait(int epfd, struct epoll_event __user *events,
+			 int maxevents, struct timespec64 *to)
+{
+	struct eventpoll *ep;
+	int ret;
+
+	/* Get the "struct file *" for the eventpoll file */
+	CLASS(fd, f)(epfd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	ret = ep_check_params(fd_file(f), events, maxevents);
+	if (unlikely(ret))
+		return ret;
 
 	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
-	ep = f.file->private_data;
+	ep = fd_file(f)->private_data;
 
 	/* Time to fish for events ... */
-	error = ep_poll(ep, events, maxevents, to);
-
-error_fput:
-	fdput(f);
-	return error;
+	return ep_poll(ep, events, maxevents, to);
 }
 
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,