Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 237
1 file changed, 114 insertions(+), 123 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..602ca4285b2e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -95,9 +95,9 @@
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
 
-#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
 
-#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
 				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
 
 /* Maximum number of nesting allowed inside epoll sets */
@@ -260,6 +260,7 @@ struct ep_pqueue {
 struct ep_send_events_data {
 	int maxevents;
 	struct epoll_event __user *events;
+	int res;
 };
 
 /*
@@ -276,12 +277,6 @@ static DEFINE_MUTEX(epmutex);
 
 /* Used to check for epoll file descriptor inclusion loops */
 static struct nested_calls poll_loop_ncalls;
 
-/* Used for safe wake up implementation */
-static struct nested_calls poll_safewake_ncalls;
-
-/* Used to call file's f_op->poll() under the nested calls boundaries */
-static struct nested_calls poll_readywalk_ncalls;
-
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __read_mostly;
 
@@ -551,40 +546,21 @@ out_unlock:
  * this special case of epoll.
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
-				     unsigned long events, int subclass)
+
+static struct nested_calls poll_safewake_ncalls;
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
 {
 	unsigned long flags;
+	wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
 
-	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
-	wake_up_locked_poll(wqueue, events);
+	spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
+	wake_up_locked_poll(wqueue, EPOLLIN);
 	spin_unlock_irqrestore(&wqueue->lock, flags);
-}
-#else
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
-				     unsigned long events, int subclass)
-{
-	wake_up_poll(wqueue, events);
-}
-#endif
 
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
-	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
-			  1 + call_nests);
 	return 0;
 }
 
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_NESTS deep.
- */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
 	int this_cpu = get_cpu();
@@ -595,6 +571,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
 	put_cpu();
 }
 
+#else
+
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+	wake_up_poll(wq, EPOLLIN);
+}
+
+#endif
+
 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 {
 	wait_queue_head_t *whead;
@@ -676,12 +661,13 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
-static int ep_scan_ready_list(struct eventpoll *ep,
-			      int (*sproc)(struct eventpoll *,
+static __poll_t ep_scan_ready_list(struct eventpoll *ep,
+			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
 {
-	int error, pwake = 0;
+	__poll_t res;
+	int pwake = 0;
 	unsigned long flags;
 	struct epitem *epi, *nepi;
 	LIST_HEAD(txlist);
@@ -710,7 +696,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	/*
 	 * Now call the callback function.
 	 */
-	error = (*sproc)(ep, &txlist, priv);
+	res = (*sproc)(ep, &txlist, priv);
 
 	spin_lock_irqsave(&ep->lock, flags);
 	/*
@@ -763,7 +749,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	if (pwake)
 		ep_poll_safewake(&ep->poll_wait);
 
-	return error;
+	return res;
 }
 
 static void epi_rcu_free(struct rcu_head *head)
@@ -880,25 +866,50 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+				    void *priv);
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+/*
+ * Differs from ep_eventpoll_poll() in that internal callers already have
+ * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
+ * is correctly annotated.
+ */
+static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
+			     int depth)
 {
+	struct eventpoll *ep;
+	bool locked;
+
 	pt->_key = epi->event.events;
+	if (!is_file_epoll(epi->ffd.file))
+		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
+		       epi->event.events;
 
-	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+	ep = epi->ffd.file->private_data;
+	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
+	locked = pt && (pt->_qproc == ep_ptable_queue_proc);
+
+	return ep_scan_ready_list(epi->ffd.file->private_data,
+				  ep_read_events_proc, &depth, depth,
+				  locked) & epi->event.events;
 }
 
-static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
 	poll_table pt;
+	int depth = *(int *)priv;
 
 	init_poll_funcptr(&pt, NULL);
+	depth++;
 
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (ep_item_poll(epi, &pt))
-			return POLLIN | POLLRDNORM;
-		else {
+		if (ep_item_poll(epi, &pt, depth)) {
+			return EPOLLIN | EPOLLRDNORM;
+		} else {
 			/*
 			 * Item has been dropped into the ready list by the poll
 			 * callback, but it's not actually ready, as far as
@@ -912,48 +923,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	return 0;
 }
 
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-				 poll_table *pt);
-
-struct readyevents_arg {
-	struct eventpoll *ep;
-	bool locked;
-};
-
-static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
-	struct readyevents_arg *arg = priv;
-
-	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
-				  call_nests + 1, arg->locked);
-}
-
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
-{
-	int pollflags;
 	struct eventpoll *ep = file->private_data;
-	struct readyevents_arg arg;
-
-	/*
-	 * During ep_insert() we already hold the ep->mtx for the tfile.
-	 * Prevent re-aquisition.
-	 */
-	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
-	arg.ep = ep;
+	int depth = 0;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
 
 	/*
 	 * Proceed to find out if wanted events are really available inside
-	 * the ready list. This need to be done under ep_call_nested()
-	 * supervision, since the call to f_op->poll() done on listed files
-	 * could re-enter here.
+	 * the ready list.
 	 */
-	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, &arg, ep, current);
-
-	return pollflags != -1 ? pollflags : 0;
+	return ep_scan_ready_list(ep, ep_read_events_proc,
+				  &depth, depth, false);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1137,6 +1120,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	unsigned long flags;
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
+	__poll_t pollflags = key_to_poll(key);
 	int ewake = 0;
 
 	spin_lock_irqsave(&ep->lock, flags);
@@ -1158,7 +1142,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * callback. We need to be able to handle both cases here, hence the
 	 * test for "key" != NULL before the event match test.
 	 */
-	if (key && !((unsigned long) key & epi->event.events))
+	if (pollflags && !(pollflags & epi->event.events))
 		goto out_unlock;
 
 	/*
@@ -1195,14 +1179,14 @@
 	 */
 	if (waitqueue_active(&ep->wq)) {
 		if ((epi->event.events & EPOLLEXCLUSIVE) &&
-					!((unsigned long)key & POLLFREE)) {
-			switch ((unsigned long)key & EPOLLINOUT_BITS) {
-			case POLLIN:
-				if (epi->event.events & POLLIN)
+					!(pollflags & POLLFREE)) {
+			switch (pollflags & EPOLLINOUT_BITS) {
+			case EPOLLIN:
+				if (epi->event.events & EPOLLIN)
 					ewake = 1;
 				break;
-			case POLLOUT:
-				if (epi->event.events & POLLOUT)
+			case EPOLLOUT:
+				if (epi->event.events & EPOLLOUT)
 					ewake = 1;
 				break;
 			case 0:
@@ -1225,7 +1209,7 @@ out_unlock:
 	if (!(epi->event.events & EPOLLEXCLUSIVE))
 		ewake = 1;
 
-	if ((unsigned long)key & POLLFREE) {
+	if (pollflags & POLLFREE) {
 		/*
 		 * If we race with ep_remove_wait_queue() it can miss
 		 * ->whead = NULL and do another remove_wait_queue() after
@@ -1429,10 +1413,11 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
 /*
  * Must be called with "mtx" held.
  */
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		     struct file *tfile, int fd, int full_check)
 {
-	int error, revents, pwake = 0;
+	int error, pwake = 0;
+	__poll_t revents;
 	unsigned long flags;
 	long user_watches;
 	struct epitem *epi;
@@ -1472,7 +1457,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * this operation completes, the poll callback can start hitting
 	 * the new item.
 	 */
-	revents = ep_item_poll(epi, &epq.pt);
+	revents = ep_item_poll(epi, &epq.pt, 1);
 
 	/*
 	 * We have to check if something went wrong during the poll wait queue
@@ -1506,7 +1491,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	ep_set_busy_poll_napi_id(epi);
 
 	/* If the file is already "ready" we drop it inside the ready list */
-	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
+	if (revents && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 		ep_pm_stay_awake(epi);
 
@@ -1560,10 +1545,10 @@ error_create_wakeup_source:
  * Modify the interest event mask by dropping an event if the new mask
  * has a match in the current file status. Must be called with "mtx" held.
  */
-static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
+static int ep_modify(struct eventpoll *ep, struct epitem *epi,
+		     const struct epoll_event *event)
 {
 	int pwake = 0;
-	unsigned int revents;
 	poll_table pt;
 
 	init_poll_funcptr(&pt, NULL);
@@ -1605,14 +1590,10 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	/*
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
-	 */
-	revents = ep_item_poll(epi, &pt);
-
-	/*
 	 * If the item is "hot" and it is not registered inside the ready
 	 * list, push it inside.
 	 */
-	if (revents & event->events) {
+	if (ep_item_poll(epi, &pt, 1)) {
 		spin_lock_irq(&ep->lock);
 		if (!ep_is_linked(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1634,12 +1615,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	return 0;
 }
 
-static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct ep_send_events_data *esed = priv;
-	int eventcnt;
-	unsigned int revents;
+	__poll_t revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
 	struct wakeup_source *ws;
@@ -1652,8 +1632,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	 * Items cannot vanish during the loop because ep_scan_ready_list() is
 	 * holding "mtx" during this call.
 	 */
-	for (eventcnt = 0, uevent = esed->events;
-	     !list_empty(head) && eventcnt < esed->maxevents;) {
+	for (esed->res = 0, uevent = esed->events;
+	     !list_empty(head) && esed->res < esed->maxevents;) {
 		epi = list_first_entry(head, struct epitem, rdllink);
 
 		/*
@@ -1674,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = ep_item_poll(epi, &pt);
+		revents = ep_item_poll(epi, &pt, 1);
 
 		/*
 		 * If the event mask intersect the caller-requested one,
@@ -1687,9 +1667,11 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 			    __put_user(epi->event.data, &uevent->data)) {
 				list_add(&epi->rdllink, head);
 				ep_pm_stay_awake(epi);
-				return eventcnt ? eventcnt : -EFAULT;
+				if (!esed->res)
+					esed->res = -EFAULT;
+				return 0;
 			}
-			eventcnt++;
+			esed->res++;
 			uevent++;
 			if (epi->event.events & EPOLLONESHOT)
 				epi->event.events &= EP_PRIVATE_BITS;
@@ -1711,7 +1693,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 		}
 	}
 
-	return eventcnt;
+	return 0;
 }
 
 static int ep_send_events(struct eventpoll *ep,
@@ -1722,7 +1704,8 @@
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+	return esed.res;
 }
 
 static inline struct timespec64 ep_set_mstimeout(long ms)
@@ -1953,7 +1936,7 @@ static void clear_tfile_check_list(void)
 /*
  * Open an eventpoll file descriptor.
  */
-SYSCALL_DEFINE1(epoll_create1, int, flags)
+static int do_epoll_create(int flags)
 {
 	int error, fd;
 	struct eventpoll *ep = NULL;
@@ -1996,12 +1979,17 @@ out_free_ep:
 	return error;
 }
 
+SYSCALL_DEFINE1(epoll_create1, int, flags)
+{
+	return do_epoll_create(flags);
+}
+
 SYSCALL_DEFINE1(epoll_create, int, size)
 {
 	if (size <= 0)
 		return -EINVAL;
 
-	return sys_epoll_create1(0);
+	return do_epoll_create(0);
 }
 
 /*
@@ -2122,7 +2110,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= POLLERR | POLLHUP;
+			epds.events |= EPOLLERR | EPOLLHUP;
 			error = ep_insert(ep, &epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
@@ -2138,7 +2126,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_MOD:
 		if (epi) {
 			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
-				epds.events |= POLLERR | POLLHUP;
+				epds.events |= EPOLLERR | EPOLLHUP;
 				error = ep_modify(ep, epi, &epds);
 			}
 		} else
@@ -2165,8 +2153,8 @@ error_return:
  * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-		int, maxevents, int, timeout)
+static int do_epoll_wait(int epfd, struct epoll_event __user *events,
+			 int maxevents, int timeout)
 {
 	int error;
 	struct fd f;
@@ -2207,6 +2195,12 @@ error_fput:
 	return error;
 }
 
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	return do_epoll_wait(epfd, events, maxevents, timeout);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
@@ -2231,7 +2225,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 		set_current_blocked(&ksigmask);
 	}
 
-	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+	error = do_epoll_wait(epfd, events, maxevents, timeout);
 
 	/*
 	 * If we changed the signal mask, we need to restore the original one.
@@ -2259,7 +2253,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			compat_size_t, sigsetsize)
 {
 	long err;
-	compat_sigset_t csigmask;
 	sigset_t ksigmask, sigsaved;
 
 	/*
@@ -2269,14 +2262,13 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &csigmask);
 		sigsaved = current->blocked;
 		set_current_blocked(&ksigmask);
 	}
 
-	err = sys_epoll_wait(epfd, events, maxevents, timeout);
+	err = do_epoll_wait(epfd, events, maxevents, timeout);
 
 	/*
 	 * If we changed the signal mask, we need to restore the original one.
@@ -2315,11 +2307,10 @@ static int __init eventpoll_init(void)
 	 */
 	ep_nested_calls_init(&poll_loop_ncalls);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
-
-	/* Initialize the structure used to perform file's f_op->poll() calls */
-	ep_nested_calls_init(&poll_readywalk_ncalls);
+#endif
 
 	/*
 	 * We can have many thousands of epitems, so prevent this from
@@ -2329,11 +2320,11 @@ static int __init eventpoll_init(void)
 
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+		0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	/* Allocates slab cache used to allocate "struct eppoll_entry" */
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
-			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
+		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	return 0;
 }
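
A few illustrative userspace sketches follow; none are part of the commit. First, the ep_item_poll()/ep_read_events_proc() changes replace the ep_call_nested() machinery on the poll path with an explicit depth argument, which matters exactly when one epoll file descriptor watches another. A minimal sketch of that nesting (error handling trimmed):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int inner = epoll_create1(0);	/* watched epoll instance */
	int outer = epoll_create1(0);	/* watching epoll instance */
	int pfd[2];
	struct epoll_event ev = { 0 };

	if (inner < 0 || outer < 0 || pipe(pfd) < 0)
		return 1;

	/* inner watches the pipe's read end */
	ev.events = EPOLLIN;
	ev.data.fd = pfd[0];
	epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);

	/* outer watches inner: polling outer must poll inner, which is
	 * the recursive ep_item_poll() -> ep_scan_ready_list() path */
	ev.events = EPOLLIN;
	ev.data.fd = inner;
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	write(pfd[1], "x", 1);		/* make inner report EPOLLIN */

	printf("outer ready: %d\n", epoll_wait(outer, &ev, 1, 0));
	return 0;
}

Each level of nesting bumps depth, so mutex_lock_nested() in ep_scan_ready_list() gets a distinct lockdep subclass per level, while EPOLL_CTL_ADD still bounds the hierarchy through the existing loop check.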
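The ep_poll_callback() switch over EPOLLINOUT_BITS refines EPOLLEXCLUSIVE semantics: an exclusive waiter only consumes a wakeup whose poll flags intersect its own interest set. A sketch of the registration pattern this protects (hypothetical helper, fds assumed valid):

#include <sys/epoll.h>

/*
 * Attach the same fd to two epoll instances with EPOLLEXCLUSIVE.
 * When the fd signals EPOLLIN, only exclusive waiters whose interest
 * mask contains EPOLLIN may be woken; a waiter registered for
 * EPOLLOUT alone no longer absorbs the wakeup.
 */
void register_exclusive(int ep_reader, int ep_writer, int fd)
{
	struct epoll_event ev = { 0 };

	ev.data.fd = fd;

	ev.events = EPOLLIN | EPOLLEXCLUSIVE;	/* woken on readable */
	epoll_ctl(ep_reader, EPOLL_CTL_ADD, fd, &ev);

	ev.events = EPOLLOUT | EPOLLEXCLUSIVE;	/* skipped for EPOLLIN */
	epoll_ctl(ep_writer, EPOLL_CTL_ADD, fd, &ev);
}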
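Routing the event count through esed->res instead of the sproc return value lets ep_scan_ready_list() return a __poll_t while ep_send_events() keeps the long-standing epoll_wait() contract: a fault while copying events out yields -EFAULT only if nothing was delivered yet, otherwise the partial count. A quick userspace probe of the fault case (assumes the NULL page is unmapped, so __put_user() faults when copying out):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int ep = epoll_create1(0);
	int pfd[2];
	struct epoll_event ev = { .events = EPOLLIN };

	if (ep < 0 || pipe(pfd) < 0)
		return 1;
	ev.data.fd = pfd[0];
	epoll_ctl(ep, EPOLL_CTL_ADD, pfd[0], &ev);
	write(pfd[1], "x", 1);		/* one event is pending */

	/* NULL is not writable: the copy-out faults before any event is
	 * delivered, so esed->res is still 0 and -EFAULT is returned */
	if (epoll_wait(ep, NULL, 1, 0) < 0)
		printf("epoll_wait: %s\n", strerror(errno));
	return 0;
}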
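Finally, both epoll_pwait() paths (native and compat) now funnel into do_epoll_wait(), with the compat side using get_compat_sigset() instead of an open-coded copy_from_user()/sigset_from_compat() pair; the userspace contract of an atomic mask swap around the wait is unchanged. A small sketch of that contract (hypothetical wrapper):

#define _GNU_SOURCE
#include <signal.h>
#include <sys/epoll.h>

/*
 * Wait for events with every signal except SIGINT blocked. The kernel
 * applies the mask via set_current_blocked() around do_epoll_wait()
 * and restores the original mask before returning, so there is no
 * window in which a pending signal can be lost or fire unexpectedly.
 */
int wait_allow_sigint(int epfd, struct epoll_event *events, int maxevents)
{
	sigset_t mask;

	sigfillset(&mask);
	sigdelset(&mask, SIGINT);	/* only SIGINT may interrupt */

	return epoll_pwait(epfd, events, maxevents, -1, &mask);
}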