Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--  fs/eventpoll.c  237
1 files changed, 114 insertions, 123 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..602ca4285b2e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -95,9 +95,9 @@
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
-#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
-#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
/* Maximum number of nesting allowed inside epoll sets */
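Background for this hunk: EPOLLIN, EPOLLOUT, EPOLLERR and EPOLLHUP carry the same bit values as their POLL* counterparts; the rename matters inside the kernel, where the EPOLL* names are typed __poll_t. A minimal userspace sketch exercising bits from EPOLLEXCLUSIVE_OK_BITS (assumes Linux 4.5+ headers for EPOLLEXCLUSIVE; error handling trimmed):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int epfd = epoll_create1(0);
        int pfd[2];
        struct epoll_event ev;

        pipe(pfd);
        /* EPOLLERR/EPOLLHUP are always reported anyway, but naming them
         * keeps the mask within EPOLLEXCLUSIVE_OK_BITS. */
        ev.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLEXCLUSIVE;
        ev.data.fd = pfd[0];
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev) < 0)
                perror("epoll_ctl");
        close(epfd);
        return 0;
}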
@@ -260,6 +260,7 @@ struct ep_pqueue {
struct ep_send_events_data {
int maxevents;
struct epoll_event __user *events;
+ int res;
};
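The new res member anticipates a change later in this diff: ep_send_events_proc() will return __poll_t, which cannot carry a signed event count or -EFAULT, so the result is threaded through the callback's private data instead. A standalone sketch of that pattern (all names here are illustrative, not kernel API):

#include <stdio.h>

struct walk_ctx {
        int max;
        int res;        /* count on success, negative errno on failure */
};

static unsigned int visit(void *priv, int value)
{
        struct walk_ctx *ctx = priv;

        (void)value;
        if (ctx->res < 0 || ctx->res >= ctx->max)
                return 0;
        ctx->res++;     /* "delivered one event" */
        return 0;       /* return value is reserved for flag bits */
}

int main(void)
{
        struct walk_ctx ctx = { .max = 8, .res = 0 };

        for (int i = 0; i < 16; i++)
                visit(&ctx, i);
        printf("delivered %d\n", ctx.res);      /* prints 8 */
        return 0;
}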
/*
@@ -276,12 +277,6 @@ static DEFINE_MUTEX(epmutex);
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
-/* Used for safe wake up implementation */
-static struct nested_calls poll_safewake_ncalls;
-
-/* Used to call file's f_op->poll() under the nested calls boundaries */
-static struct nested_calls poll_readywalk_ncalls;
-
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;
@@ -551,40 +546,21 @@ out_unlock:
* this special case of epoll.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
- unsigned long events, int subclass)
+
+static struct nested_calls poll_safewake_ncalls;
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
{
unsigned long flags;
+ wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
- spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
- wake_up_locked_poll(wqueue, events);
+ spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
+ wake_up_locked_poll(wqueue, EPOLLIN);
spin_unlock_irqrestore(&wqueue->lock, flags);
-}
-#else
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
- unsigned long events, int subclass)
-{
- wake_up_poll(wqueue, events);
-}
-#endif
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
- ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
- 1 + call_nests);
return 0;
}
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_NESTS deep.
- */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
int this_cpu = get_cpu();
@@ -595,6 +571,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
put_cpu();
}
+#else
+
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+ wake_up_poll(wq, EPOLLIN);
+}
+
+#endif
+
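Only the lockdep build keeps the nested-calls machinery, because wakeups genuinely can nest: one epoll set may watch another, so waking the inner set re-enters the wakeup path for the outer one. A runnable userspace illustration of that topology (error handling trimmed):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int inner = epoll_create1(0);
        int outer = epoll_create1(0);
        int pfd[2];
        struct epoll_event ev = { .events = EPOLLIN };

        pipe(pfd);
        ev.data.fd = pfd[0];
        epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);
        ev.data.fd = inner;
        epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);    /* epoll-in-epoll */

        write(pfd[1], "x", 1);  /* wakes inner, which wakes outer */

        struct epoll_event out;
        int n = epoll_wait(outer, &out, 1, 1000);
        printf("outer ready: %d\n", n);                 /* prints 1 */
        return 0;
}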
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
wait_queue_head_t *whead;
@@ -676,12 +661,13 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
*
* Returns: The same integer error code returned by the @sproc callback.
*/
-static int ep_scan_ready_list(struct eventpoll *ep,
- int (*sproc)(struct eventpoll *,
+static __poll_t ep_scan_ready_list(struct eventpoll *ep,
+ __poll_t (*sproc)(struct eventpoll *,
struct list_head *, void *),
void *priv, int depth, bool ep_locked)
{
- int error, pwake = 0;
+ __poll_t res;
+ int pwake = 0;
unsigned long flags;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
@@ -710,7 +696,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
/*
* Now call the callback function.
*/
- error = (*sproc)(ep, &txlist, priv);
+ res = (*sproc)(ep, &txlist, priv);
spin_lock_irqsave(&ep->lock, flags);
/*
@@ -763,7 +749,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
if (pwake)
ep_poll_safewake(&ep->poll_wait);
- return error;
+ return res;
}
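The sproc callbacks now return __poll_t, which is declared as a sparse-checked bitwise type (typedef unsigned __bitwise __poll_t; in include/uapi/linux/types.h), so event masks can no longer be silently mixed with signed error codes. A standalone stand-in showing the mechanism (my_poll_t is a local substitute, not the kernel type; run sparse over it to see the check fire):

#ifdef __CHECKER__
#define __bitwise __attribute__((bitwise))
#define __force   __attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned __bitwise my_poll_t;

#define MY_POLLIN ((__force my_poll_t)0x0001)

static my_poll_t scan(void)
{
        return MY_POLLIN;       /* ok: properly typed constant */
        /* return -1; */        /* sparse would warn: int vs my_poll_t */
}

int main(void)
{
        return scan() == MY_POLLIN ? 0 : 1;
}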
static void epi_rcu_free(struct rcu_head *head)
@@ -880,25 +866,50 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
return 0;
}
-static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+ void *priv);
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+ poll_table *pt);
+
+/*
+ * Differs from ep_eventpoll_poll() in that internal callers already have
+ * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
+ * is correctly annotated.
+ */
+static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
+ int depth)
{
+ struct eventpoll *ep;
+ bool locked;
+
pt->_key = epi->event.events;
+ if (!is_file_epoll(epi->ffd.file))
+ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
+ epi->event.events;
+
+ ep = epi->ffd.file->private_data;
+ poll_wait(epi->ffd.file, &ep->poll_wait, pt);
+ locked = pt && (pt->_qproc == ep_ptable_queue_proc);
- return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+ return ep_scan_ready_list(epi->ffd.file->private_data,
+ ep_read_events_proc, &depth, depth,
+ locked) & epi->event.events;
}
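The depth argument replaces the old ep_call_nested() bookkeeping: external entry points pass 0, internal callers that already hold ep->mtx pass 1, and each nested epoll level adds one, so the *_nested() lock annotations see distinct subclasses. A toy sketch of the convention (names illustrative; in the kernel the actual recursion bound comes from loop checking at insert time, not from a check in the poll path):

#include <stdio.h>

#define MAX_NESTS 4

static unsigned int poll_one(int fd_is_epoll, int depth)
{
        if (depth > MAX_NESTS)          /* refuse runaway epoll-in-epoll */
                return 0;
        if (!fd_is_epoll)
                return 0x1;             /* plain file: ask its ->poll() */
        /* nested epoll: scan its ready list one level deeper */
        return poll_one(0, depth + 1);
}

int main(void)
{
        printf("%#x\n", poll_one(1, 1));        /* internal caller: depth 1 */
        return 0;
}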
-static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct epitem *epi, *tmp;
poll_table pt;
+ int depth = *(int *)priv;
init_poll_funcptr(&pt, NULL);
+ depth++;
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- if (ep_item_poll(epi, &pt))
- return POLLIN | POLLRDNORM;
- else {
+ if (ep_item_poll(epi, &pt, depth)) {
+ return EPOLLIN | EPOLLRDNORM;
+ } else {
/*
* Item has been dropped into the ready list by the poll
* callback, but it's not actually ready, as far as
@@ -912,48 +923,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
return 0;
}
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt);
-
-struct readyevents_arg {
- struct eventpoll *ep;
- bool locked;
-};
-
-static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
- struct readyevents_arg *arg = priv;
-
- return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
- call_nests + 1, arg->locked);
-}
-
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
-{
- int pollflags;
struct eventpoll *ep = file->private_data;
- struct readyevents_arg arg;
-
- /*
- * During ep_insert() we already hold the ep->mtx for the tfile.
- * Prevent re-aquisition.
- */
- arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
- arg.ep = ep;
+ int depth = 0;
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
/*
* Proceed to find out if wanted events are really available inside
- * the ready list. This need to be done under ep_call_nested()
- * supervision, since the call to f_op->poll() done on listed files
- * could re-enter here.
+ * the ready list.
*/
- pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
- ep_poll_readyevents_proc, &arg, ep, current);
-
- return pollflags != -1 ? pollflags : 0;
+ return ep_scan_ready_list(ep, ep_read_events_proc,
+ &depth, depth, false);
}
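Dropping poll_readywalk_ncalls is safe because cycles and excessive nesting between epoll sets are refused when the sets are wired together, at EPOLL_CTL_ADD time, so the poll path may recurse directly. Quick userspace check that the loop really is rejected up front (error handling trimmed):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>

int main(void)
{
        int a = epoll_create1(0);
        int b = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN };

        ev.data.fd = a;
        epoll_ctl(b, EPOLL_CTL_ADD, a, &ev);            /* b watches a: fine */
        ev.data.fd = b;
        if (epoll_ctl(a, EPOLL_CTL_ADD, b, &ev) < 0)    /* a watches b: loop */
                printf("rejected: %s\n", strerror(errno));      /* ELOOP */
        return 0;
}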
#ifdef CONFIG_PROC_FS
@@ -1137,6 +1120,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
+ __poll_t pollflags = key_to_poll(key);
int ewake = 0;
spin_lock_irqsave(&ep->lock, flags);
@@ -1158,7 +1142,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
- if (key && !((unsigned long) key & epi->event.events))
+ if (pollflags && !(pollflags & epi->event.events))
goto out_unlock;
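key_to_poll() is the typed replacement for the old (unsigned long)key casts; in the kernel it lives in include/linux/poll.h and simply reinterprets the opaque wait-queue key as a __poll_t mask. A standalone rendering of what it does (my_poll_t again a local stand-in):

#include <stdio.h>

typedef unsigned int my_poll_t;         /* stand-in for __poll_t */

static my_poll_t key_to_poll_sketch(void *key)
{
        return (my_poll_t)(unsigned long)key;
}

int main(void)
{
        void *key = (void *)(unsigned long)0x0001;      /* EPOLLIN as a key */
        printf("%#x\n", key_to_poll_sketch(key));       /* prints 0x1 */
        return 0;
}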
/*
@@ -1195,14 +1179,14 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
*/
if (waitqueue_active(&ep->wq)) {
if ((epi->event.events & EPOLLEXCLUSIVE) &&
- !((unsigned long)key & POLLFREE)) {
- switch ((unsigned long)key & EPOLLINOUT_BITS) {
- case POLLIN:
- if (epi->event.events & POLLIN)
+ !(pollflags & POLLFREE)) {
+ switch (pollflags & EPOLLINOUT_BITS) {
+ case EPOLLIN:
+ if (epi->event.events & EPOLLIN)
ewake = 1;
break;
- case POLLOUT:
- if (epi->event.events & POLLOUT)
+ case EPOLLOUT:
+ if (epi->event.events & EPOLLOUT)
ewake = 1;
break;
case 0:
@@ -1225,7 +1209,7 @@ out_unlock:
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
- if ((unsigned long)key & POLLFREE) {
+ if (pollflags & POLLFREE) {
/*
* If we race with ep_remove_wait_queue() it can miss
* ->whead = NULL and do another remove_wait_queue() after
@@ -1429,10 +1413,11 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
/*
* Must be called with "mtx" held.
*/
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
- int error, revents, pwake = 0;
+ int error, pwake = 0;
+ __poll_t revents;
unsigned long flags;
long user_watches;
struct epitem *epi;
@@ -1472,7 +1457,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = ep_item_poll(epi, &epq.pt);
+ revents = ep_item_poll(epi, &epq.pt, 1);
/*
* We have to check if something went wrong during the poll wait queue
@@ -1506,7 +1491,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
- if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
+ if (revents && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1560,10 +1545,10 @@ error_create_wakeup_source:
* Modify the interest event mask by dropping an event if the new mask
* has a match in the current file status. Must be called with "mtx" held.
*/
-static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
+static int ep_modify(struct eventpoll *ep, struct epitem *epi,
+ const struct epoll_event *event)
{
int pwake = 0;
- unsigned int revents;
poll_table pt;
init_poll_funcptr(&pt, NULL);
@@ -1605,14 +1590,10 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
/*
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
- */
- revents = ep_item_poll(epi, &pt);
-
- /*
* If the item is "hot" and it is not registered inside the ready
* list, push it inside.
*/
- if (revents & event->events) {
+ if (ep_item_poll(epi, &pt, 1)) {
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1634,12 +1615,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
return 0;
}
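ep_modify() can fold the check into the if condition because ep_item_poll() already masks its result with the just-updated epi->event.events. The user-visible behaviour of this path: EPOLL_CTL_MOD re-polls the file against the new mask immediately. A runnable sketch (error handling trimmed):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int epfd = epoll_create1(0);
        int pfd[2];
        struct epoll_event ev = { .events = EPOLLIN };

        pipe(pfd);
        ev.data.fd = pfd[1];
        epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[1], &ev);    /* write end, wrong mask */
        printf("EPOLLIN armed:  %d ready\n", epoll_wait(epfd, &ev, 1, 0));

        ev.events = EPOLLOUT;
        ev.data.fd = pfd[1];
        epoll_ctl(epfd, EPOLL_CTL_MOD, pfd[1], &ev);    /* now matches status */
        printf("EPOLLOUT armed: %d ready\n", epoll_wait(epfd, &ev, 1, 0));
        return 0;
}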
-static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct ep_send_events_data *esed = priv;
- int eventcnt;
- unsigned int revents;
+ __poll_t revents;
struct epitem *epi;
struct epoll_event __user *uevent;
struct wakeup_source *ws;
@@ -1652,8 +1632,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* Items cannot vanish during the loop because ep_scan_ready_list() is
* holding "mtx" during this call.
*/
- for (eventcnt = 0, uevent = esed->events;
- !list_empty(head) && eventcnt < esed->maxevents;) {
+ for (esed->res = 0, uevent = esed->events;
+ !list_empty(head) && esed->res < esed->maxevents;) {
epi = list_first_entry(head, struct epitem, rdllink);
/*
@@ -1674,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
list_del_init(&epi->rdllink);
- revents = ep_item_poll(epi, &pt);
+ revents = ep_item_poll(epi, &pt, 1);
/*
* If the event mask intersect the caller-requested one,
@@ -1687,9 +1667,11 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
- return eventcnt ? eventcnt : -EFAULT;
+ if (!esed->res)
+ esed->res = -EFAULT;
+ return 0;
}
- eventcnt++;
+ esed->res++;
uevent++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
@@ -1711,7 +1693,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
}
}
- return eventcnt;
+ return 0;
}
static int ep_send_events(struct eventpoll *ep,
@@ -1722,7 +1704,8 @@ static int ep_send_events(struct eventpoll *ep,
esed.maxevents = maxevents;
esed.events = events;
- return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+ ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+ return esed.res;
}
static inline struct timespec64 ep_set_mstimeout(long ms)
@@ -1953,7 +1936,7 @@ static void clear_tfile_check_list(void)
/*
* Open an eventpoll file descriptor.
*/
-SYSCALL_DEFINE1(epoll_create1, int, flags)
+static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
@@ -1996,12 +1979,17 @@ out_free_ep:
return error;
}
+SYSCALL_DEFINE1(epoll_create1, int, flags)
+{
+ return do_epoll_create(flags);
+}
+
SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;
- return sys_epoll_create1(0);
+ return do_epoll_create(0);
}
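Indirecting through do_epoll_create() follows the tree-wide effort to stop calling sys_*() entry points from inside the kernel. From userspace the two syscalls were already equivalent apart from the size > 0 check, as a quick demo shows (error handling trimmed):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int a = epoll_create(1);                /* size checked, then ignored */
        int b = epoll_create1(EPOLL_CLOEXEC);   /* flags instead of size */

        printf("a=%d b=%d\n", a, b);
        close(a);
        close(b);
        return 0;
}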
/*
@@ -2122,7 +2110,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
- epds.events |= POLLERR | POLLHUP;
+ epds.events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
@@ -2138,7 +2126,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
- epds.events |= POLLERR | POLLHUP;
+ epds.events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, &epds);
}
} else
@@ -2165,8 +2153,8 @@ error_return:
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout)
+static int do_epoll_wait(int epfd, struct epoll_event __user *events,
+ int maxevents, int timeout)
{
int error;
struct fd f;
@@ -2207,6 +2195,12 @@ error_fput:
return error;
}
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout)
+{
+ return do_epoll_wait(epfd, events, maxevents, timeout);
+}
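Same pattern for epoll_wait(): the SYSCALL_DEFINE4 wrapper stays trivial, and epoll_pwait() below now calls do_epoll_wait() instead of the syscall. All epoll_pwait() adds is an atomic signal-mask swap around the wait, which userspace can rely on like this (error handling trimmed):

#include <signal.h>
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
        int epfd = epoll_create1(0);
        sigset_t block_all, during_wait;
        struct epoll_event ev;

        sigfillset(&block_all);
        sigprocmask(SIG_SETMASK, &block_all, NULL);

        sigfillset(&during_wait);
        sigdelset(&during_wait, SIGUSR1);       /* deliverable only here */

        int n = epoll_pwait(epfd, &ev, 1, 100, &during_wait);
        printf("epoll_pwait returned %d\n", n); /* 0: timeout, no fds */
        return 0;
}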
+
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
@@ -2231,7 +2225,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
set_current_blocked(&ksigmask);
}
- error = sys_epoll_wait(epfd, events, maxevents, timeout);
+ error = do_epoll_wait(epfd, events, maxevents, timeout);
/*
* If we changed the signal mask, we need to restore the original one.
@@ -2259,7 +2253,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
compat_size_t, sigsetsize)
{
long err;
- compat_sigset_t csigmask;
sigset_t ksigmask, sigsaved;
/*
@@ -2269,14 +2262,13 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
if (sigmask) {
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+ if (get_compat_sigset(&ksigmask, sigmask))
return -EFAULT;
- sigset_from_compat(&ksigmask, &csigmask);
sigsaved = current->blocked;
set_current_blocked(&ksigmask);
}
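get_compat_sigset() collapses the old copy_from_user() + sigset_from_compat() pair into one helper that copies and widens in a single step. A rough userspace model of the widening it performs for a 64-bit kernel (layout assumed: two 32-bit compat words per 64-bit sigset word; the real helper also handles the user-pointer access):

#include <stdio.h>

typedef struct { unsigned long sig[1]; } ksigset_t;     /* one 64-bit word */
typedef struct { unsigned int  sig[2]; } csigset_t;     /* two 32-bit words */

static int get_compat_sigset_sketch(ksigset_t *set, const csigset_t *compat)
{
        set->sig[0] = ((unsigned long)compat->sig[1] << 32) | compat->sig[0];
        return 0;
}

int main(void)
{
        csigset_t c = { { 0x1u << 10, 0 } };
        ksigset_t k;

        get_compat_sigset_sketch(&k, &c);
        printf("%#lx\n", k.sig[0]);     /* prints 0x400 */
        return 0;
}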
- err = sys_epoll_wait(epfd, events, maxevents, timeout);
+ err = do_epoll_wait(epfd, events, maxevents, timeout);
/*
* If we changed the signal mask, we need to restore the original one.
@@ -2315,11 +2307,10 @@ static int __init eventpoll_init(void)
*/
ep_nested_calls_init(&poll_loop_ncalls);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);
-
- /* Initialize the structure used to perform file's f_op->poll() calls */
- ep_nested_calls_init(&poll_readywalk_ncalls);
+#endif
/*
* We can have many thousands of epitems, so prevent this from
@@ -2329,11 +2320,11 @@ static int __init eventpoll_init(void)
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
- sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
+ sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
return 0;
}
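SLAB_ACCOUNT charges every epitem and eppoll_entry allocation to the allocating task's memory cgroup, which matters because unprivileged users can create watches in bulk. A minimal module-style sketch of the same flag combination (illustrative, not part of this patch):

#include <linux/module.h>
#include <linux/slab.h>

struct demo_item { int v; };

static struct kmem_cache *demo_cache;

static int __init demo_init(void)
{
        demo_cache = kmem_cache_create("demo_item", sizeof(struct demo_item),
                                       0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
                                       NULL);
        return demo_cache ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
        kmem_cache_destroy(demo_cache);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");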