From 37056719bba500d0d2b8216fdf641e5507ec9a0e Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 24 Mar 2017 10:08:18 -0700
Subject: net: Track start of busy loop instead of when it should end

This patch flips the logic we were using to determine if the busy polling
has timed out.  The main motivation for this is that we will need to
support two different possible timeout values in the future, and by
recording the start time rather than when we would want to end, we can
focus on making the end_time specific to the task, be it epoll or socket
based polling.

Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 fs/select.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index e2112270d75a..9287d3a96e35 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -409,7 +409,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 	int retval, i, timed_out = 0;
 	u64 slack = 0;
 	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
-	unsigned long busy_end = 0;
+	unsigned long busy_start = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -512,11 +512,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 
 		/* only if found POLL_BUSY_LOOP sockets && not out of time */
 		if (can_busy_loop && !need_resched()) {
-			if (!busy_end) {
-				busy_end = busy_loop_end_time();
+			if (!busy_start) {
+				busy_start = busy_loop_current_time();
 				continue;
 			}
-			if (!busy_loop_timeout(busy_end))
+			if (!busy_loop_timeout(busy_start))
 				continue;
 		}
 		busy_flag = 0;
@@ -800,7 +800,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 	int timed_out = 0, count = 0;
 	u64 slack = 0;
 	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
-	unsigned long busy_end = 0;
+	unsigned long busy_start = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -853,11 +853,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 
 		/* only if found POLL_BUSY_LOOP sockets && not out of time */
 		if (can_busy_loop && !need_resched()) {
-			if (!busy_end) {
-				busy_end = busy_loop_end_time();
+			if (!busy_start) {
+				busy_start = busy_loop_current_time();
 				continue;
 			}
-			if (!busy_loop_timeout(busy_end))
+			if (!busy_loop_timeout(busy_start))
 				continue;
 		}
 		busy_flag = 0;
-- cgit

From bf3b9f6372c45b0fbf24d86b8794910d20170017 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala
Date: Fri, 24 Mar 2017 10:08:30 -0700
Subject: epoll: Add busy poll support to epoll with socket fds.

This patch adds busy poll support to epoll.  The implementation is meant
to be opportunistic in that it will take the NAPI ID from the last socket
that is added to the ready list that contains a valid NAPI ID, and it will
use that for busy polling until the ready list goes empty.  Once the ready
list goes empty the NAPI ID is reset and busy polling is disabled until a
new socket is added to the ready list.

In addition, when we insert a new socket into the epoll we record the NAPI
ID and assume we are going to receive events on it.  If that doesn't occur
it will be evicted as the active NAPI ID and we will resume normal
behavior.

An application can use SO_INCOMING_CPU or SO_REUSEPORT_ATTACH_C/EBPF
socket options to spread the incoming connections to specific worker
threads based on the incoming queue.  This enables epoll for each worker
thread to have only sockets that receive packets from a single queue.
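As a rough userspace sketch of that setup (not part of the patch; the port
number, the CPU-per-worker mapping and the omitted error handling are
illustrative assumptions, while SO_REUSEPORT and SO_INCOMING_CPU are
existing socket options), each worker can open its own SO_REUSEPORT
listener pinned to a CPU and add it to that worker's private epoll set:

/* Illustrative only: one SO_REUSEPORT listener per worker, pinned to a CPU,
 * so each worker's epoll set only holds sockets fed by that CPU's RX queue. */
#include <sys/epoll.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int make_worker_listener(int cpu, int epfd)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	struct epoll_event ev = { .events = EPOLLIN };

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	/* steer connections whose packets arrive on this CPU to this listener */
	setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);	/* placeholder port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 128);

	ev.data.fd = fd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
	return fd;
}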
When an application then calls epoll_wait() and there are no events
available to report, busy polling is done on the associated queue to pull
the packets.

Signed-off-by: Sridhar Samudrala
Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 fs/eventpoll.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 341251421ced..5420767c9b68 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include <net/busy_poll.h>
 
 /*
  * LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
 	/* used to optimize loop detection check */
 	int visited;
 	struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* used to track busy poll napi_id */
+	unsigned int napi_id;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep)
 	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct eventpoll *ep = p;
+
+	return ep_events_available(ep) || busy_loop_timeout(start_time);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+#endif
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ep->napi_id)
+		ep->napi_id = 0;
+#endif
+}
+
+/*
+ * Set epoll busy poll NAPI ID from sk.
+ */
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct eventpoll *ep;
+	unsigned int napi_id;
+	struct socket *sock;
+	struct sock *sk;
+	int err;
+
+	if (!net_busy_loop_on())
+		return;
+
+	sock = sock_from_file(epi->ffd.file, &err);
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+	if (!sk)
+		return;
+
+	napi_id = READ_ONCE(sk->sk_napi_id);
+	ep = epi->ep;
+
+	/* Non-NAPI IDs can be rejected
+	 * or
+	 * Nothing to do if we already have this ID
+	 */
+	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+		return;
+
+	/* record NAPI ID for use in next busy poll */
+	ep->napi_id = napi_id;
+#endif
+}
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  * that the recursion limit is not exceeded, and that
@@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 
 	spin_lock_irqsave(&ep->lock, flags);
 
+	ep_set_busy_poll_napi_id(epi);
+
 	/*
 	 * If the event mask does not contain any poll(2) event, we consider the
 	 * descriptor to be disabled. This condition is likely the effect of the
@@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* We have to drop the new item inside our item list to keep track of it */
 	spin_lock_irqsave(&ep->lock, flags);
 
+	/* record NAPI ID of new item if present */
+	ep_set_busy_poll_napi_id(epi);
+
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1637,9 +1719,20 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	}
 
 fetch_events:
+
+	if (!ep_events_available(ep))
+		ep_busy_loop(ep, timed_out);
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	if (!ep_events_available(ep)) {
+		/*
+		 * Busy poll timed out. Drop NAPI ID for now, we can add
+		 * it back in when we have moved a socket with a valid NAPI
+		 * ID onto the ready list.
+		 */
+		ep_reset_busy_poll_napi_id(ep);
+
 		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
-- cgit

From c6e970a04bdceb7ef1fdbac6be3bd4cd0a0a02bd Mon Sep 17 00:00:00 2001
From: Andrew Lunn
Date: Tue, 28 Mar 2017 23:45:06 +0200
Subject: net: break include loop netdevice.h, dsa.h, devlink.h

There is an include loop between netdevice.h, dsa.h and devlink.h because
of NETDEV_ALIGN, making it impossible to use devlink structures in dsa.h.

Break this loop by taking dsa.h out of netdevice.h and adding a forward
declaration of dsa_switch_tree and the netdev_set_default_ethtool_ops()
function, which is all that netdevice.h requires.

No longer having dsa.h in netdevice.h means the headers dsa.h pulls in are
no longer included transitively.  This breaks a few other files which
depend on those includes, so add them directly in the affected files.

Signed-off-by: Andrew Lunn
Reviewed-by: Florian Fainelli
Signed-off-by: David S. Miller
---
 fs/cifs/cifsfs.c  | 1 +
 fs/cifs/connect.c | 1 +
 fs/cifs/smb2pdu.c | 1 +
 3 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 15e1db8738ae..8c91f37ac0eb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "cifsfs.h"
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9ae695ae3ed7..858698dcde3c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 7446496850a3..fb75fe908225 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "smb2pdu.h"
-- cgit

From 3a92789af0d625caff1e0bf5701aec8edf0d057d Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 6 Apr 2017 10:11:56 +0100
Subject: rxrpc: Use negative error codes in rxrpc_call struct

Use negative error codes in struct rxrpc_call::error because that's what
the kernel normally deals with and to make the code consistent.  We only
turn them positive when transcribing into a cmsg for userspace recvmsg.
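As a rough illustration of that convention (not part of this patch; the
struct and function names below are invented), kernel-internal code keeps
errno values negative and only flips the sign at the boundary where they
are copied out to userspace:

/* Hypothetical sketch: negative errno inside the kernel, positive when
 * transcribed for userspace (e.g. into a recvmsg cmsg). */
#include <errno.h>

struct fake_call {
	int error;			/* kernel-style: 0 or a negative errno */
};

static void record_failure(struct fake_call *call)
{
	call->error = -ENOMEM;		/* stored negative, as rxrpc_call::error now is */
}

static unsigned int error_for_userspace(const struct fake_call *call)
{
	/* sign is flipped only at the userspace boundary */
	return (unsigned int)-call->error;
}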
Signed-off-by: David Howells
---
 fs/afs/rxrpc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8f76b13d5549..d5990eb160bd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -419,7 +419,7 @@ error_do_abort:
 	call->state = AFS_CALL_COMPLETE;
 	if (ret != -ECONNABORTED) {
 		rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT,
-					-ret, "KSD");
+					ret, "KSD");
 	} else {
 		abort_code = 0;
 		offset = 0;
@@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call)
 		case -ENOTCONN:
 			abort_code = RX_CALL_DEAD;
 			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-						abort_code, -ret, "KNC");
+						abort_code, ret, "KNC");
 			goto save_error;
 		case -ENOTSUPP:
 			abort_code = RXGEN_OPCODE;
 			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-						abort_code, -ret, "KIV");
+						abort_code, ret, "KIV");
 			goto save_error;
 		case -ENODATA:
 		case -EBADMSG:
@@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call)
 			if (call->state != AFS_CALL_AWAIT_REPLY)
 				abort_code = RXGEN_SS_UNMARSHAL;
 			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-						abort_code, EBADMSG, "KUM");
+						abort_code, -EBADMSG, "KUM");
 			goto save_error;
 		}
 	}
@@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	case -ENOMEM:
 		_debug("oom");
 		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-					RX_USER_ABORT, ENOMEM, "KOO");
+					RX_USER_ABORT, -ENOMEM, "KOO");
 	default:
 		_leave(" [error]");
 		return;
@@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	if (n == -ENOMEM) {
 		_debug("oom");
 		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-					RX_USER_ABORT, ENOMEM, "KOO");
+					RX_USER_ABORT, -ENOMEM, "KOO");
 	}
 	_leave(" [error]");
 }
-- cgit
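Tying the epoll changes earlier in this series back to userspace, here is a
rough sketch (not from any of the patches; the sysctl step, the event-array
size and the blocking timeout are illustrative assumptions) of how the
busy-poll path added to ep_poll() gets exercised: with net.core.busy_poll
set to a non-zero number of microseconds, an epoll_wait() on a set whose
sockets all map to one receive queue will busy poll that queue's NAPI
context whenever no events are ready.

/* Illustrative only: requires sysctl net.core.busy_poll > 0 (microseconds)
 * and sockets that already carry a valid NAPI ID (i.e. have received traffic). */
#include <sys/epoll.h>

#define MAX_EVENTS 64

static int wait_for_work(int epfd)
{
	struct epoll_event events[MAX_EVENTS];
	int n;

	/* With no ready events, ep_poll() calls ep_busy_loop() before sleeping,
	 * spinning on the recorded NAPI ID until an event shows up or the
	 * busy-poll budget expires. */
	n = epoll_wait(epfd, events, MAX_EVENTS, -1 /* block */);
	return n;	/* number of ready descriptors, or -1 on error */
}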