From b92cf4aab8e688b1bd501ac2ac4f1b5c99601e3b Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:22 -0500 Subject: net: Create and use new helper xfrm_dst_child(). Only IPSEC routes have a non-NULL dst->child pointer. And IPSEC routes are identified by a non-NULL dst->xfrm pointer. Signed-off-by: David S. Miller --- net/core/dst.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 662a2d4a3d19..6a3c21b8fc8d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -116,12 +116,14 @@ EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct dst_entry *child; + struct dst_entry *child = NULL; smp_rmb(); - child = dst->child; - +#ifdef CONFIG_XFRM + if (dst->xfrm) + child = dst->child; +#endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); -- cgit From b6ca8bd5a9198c70c48297390723e4e56bd6e879 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:45:44 -0500 Subject: xfrm: Move child route linkage into xfrm_dst. XFRM bundle child chains look like this: xdst1 --> xdst2 --> xdst3 --> path_dst All of xdstN are xfrm_dst objects and xdst->u.dst.xfrm is non-NULL. The final child pointer in the chain, here called 'path_dst', is some other kind of route such as an ipv4 or ipv6 one. The xfrm output path pops routes, one at a time, via the child pointer, until we hit one which has a dst->xfrm pointer which is NULL. We can easily preserve the above mechanisms with child sitting only in the xfrm_dst structure. All children in the chain before we break out of the xfrm_output() loop have dst->xfrm non-NULL and are therefore xfrm_dst objects. Since we break out of the loop when we find dst->xfrm NULL, we will not try to dereference 'dst' as if it were an xfrm_dst. Signed-off-by: David S. Miller --- net/core/dst.c | 9 ++++++--- net/core/pktgen.c | 12 ++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 6a3c21b8fc8d..5cf96179e8e0 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -62,7 +63,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { - dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); @@ -121,8 +121,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) smp_rmb(); #ifdef CONFIG_XFRM - if (dst->xfrm) - child = dst->child; + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + + child = xdst->child; + } #endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f95a15086225..b9ce241cd28c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -399,7 +399,7 @@ struct pktgen_dev { __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ __u32 spi; - struct dst_entry dst; + struct xfrm_dst xdst; struct dst_ops dstops; #endif char result[512]; @@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) * supports both transport/tunnel mode + ESP/AH type. */ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) - skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); @@ -3742,10 +3742,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) * performance under such circumstance. */ pkt_dev->dstops.family = AF_INET; - pkt_dev->dst.dev = pkt_dev->odev; - dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); - pkt_dev->dst.child = &pkt_dev->dst; - pkt_dev->dst.ops = &pkt_dev->dstops; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); -- cgit From 3a2232e92e87166a8a5113e918b8c7b7bdce4d83 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:40 -0500 Subject: ipv6: Move dst->from into struct rt6_info. The dst->from value is only used by ipv6 routes to track where a route "came from". Any time we clone or copy a core ipv6 route in the ipv6 routing tables, we have the copy/clone's ->from point to the base route. This is used to handle route expiration properly. Only ipv6 uses this mechanism, and only ipv6 code references it. So it is safe to move it into rt6_info. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- net/core/dst.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 5cf96179e8e0..cf2076c0eb22 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -70,7 +70,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif -- cgit From 0f6c480f23f49b53644b383c5554e579498347f3 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:46 -0500 Subject: xfrm: Move dst->path into struct xfrm_dst The first member of an IPSEC route bundle chain sets it's dst->path to the underlying ipv4/ipv6 route that carries the bundle. Stated another way, if one were to follow the xfrm_dst->child chain of the bundle, the final non-NULL pointer would be the path and point to either an ipv4 or an ipv6 route. This is largely used to make sure that PMTU events propagate down to the correct ipv4 or ipv6 route. When we don't have the top of an IPSEC bundle 'dst->path == dst'. Move it down into xfrm_dst and key off of dst->xfrm. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- net/core/dst.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index cf2076c0eb22..9bc3bb6e94ef 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -69,7 +69,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif -- cgit From 7149f813d12d92ba9abcf49026f7cebc3d55c426 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:41:07 -0500 Subject: net: Remove dst->next There are no more users. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- net/core/dst.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 9bc3bb6e94ef..007aa0b08291 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -86,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } -- cgit From e94a62f507f9498e5f4b2c69ef181b3402934c2a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Nov 2017 15:39:34 +0100 Subject: net/reuseport: drop legacy code Since commit e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") and commit c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") the relevant reuseport socket matching the current packet is selected by the reuseport_select_sock() call. The only exceptions are invalid BPF filters/filters returning out-of-range indices. In the latter case the code implicitly falls back to using the hash demultiplexing, but instead of selecting the socket inside the reuseport_select_sock() function, it relies on the hash selection logic introduced with the early soreuseport implementation. With this patch, in case of a BPF filter returning a bad socket index value, we fall back to hash-based selection inside the reuseport_select_sock() body, so that we can drop some duplicate code in the ipv4 and ipv6 stack. This also allows faster lookup in the above scenario and will allow us to avoid computing the hash value for successful, BPF based demultiplexing - in a later patch. Signed-off-by: Paolo Abeni Acked-by: Craig Gallek Signed-off-by: David S. Miller --- net/core/sock_reuseport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5eeb1d20cc38..c5bb52bc73a1 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk, if (prog && skb) sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); - else + + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; } -- cgit From 118b4aa25d90d0930611b71dd28a749c67309ccb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Dec 2017 15:08:55 -0800 Subject: net: xdp: avoid output parameters when querying XDP prog The output parameters will get unwieldy if we want to add more information about the program. Simply pass the entire struct netdev_bpf in. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- net/core/dev.c | 24 ++++++++++++++---------- net/core/rtnetlink.c | 6 +++++- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 07ed21d64f92..3f271c9cb5e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7073,17 +7073,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) { - struct netdev_bpf xdp; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG; /* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + WARN_ON(bpf_op(dev, xdp) < 0); +} + +static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +{ + struct netdev_bpf xdp; + + __dev_xdp_query(dev, bpf_op, &xdp); return xdp.prog_attached; } @@ -7134,10 +7138,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dabba2a91fc8..9c4cb584bfb0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1261,6 +1261,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; + struct netdev_bpf xdp; ASSERT_RTNL(); @@ -1273,7 +1274,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + __dev_xdp_query(dev, ops->ndo_bpf, &xdp); + *prog_id = xdp.prog_id; + + return xdp.prog_attached; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit From bd0b2e7fe611953470ec7c533b455fb2abd382cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Dec 2017 15:08:57 -0800 Subject: net: xdp: make the stack take care of the tear down Since day one of XDP drivers had to remember to free the program on the remove path. This leads to code duplication and is error prone. Make the stack query the installed programs on unregister and if something is installed, remove the program. Freeing of program attached to XDP generic is moved from free_netdev() as well. Because the remove will now be called before notifiers are invoked, BPF offload state of the program will not get destroyed before uninstall. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- net/core/dev.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3f271c9cb5e0..6bea8931bb62 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7110,6 +7110,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, return bpf_op(dev, &xdp); } +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -7240,6 +7261,7 @@ static void rollback_registered_many(struct list_head *head) /* Shutdown queueing discipline. */ dev_shutdown(dev); + dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -8199,7 +8221,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -8218,12 +8239,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; - prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); -- cgit From b8da518c6e87504a6790bfb1539134c41b61a45a Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 1 Dec 2017 15:26:09 -0800 Subject: bpf: allow disabling tunnel csum for ipv6 Before the patch, BPF_F_ZERO_CSUM_TX can be used only for ipv4 tunnel. With introduction of ip6gretap collect_md mode, the flag should be also supported for ipv6. Signed-off-by: William Tu Cc: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..8ec5a504eb28 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3026,10 +3026,11 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; } + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; + return 0; } -- cgit From addf9b90de22f7aaad0db39bccb5d51ac47dd4e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 2 Dec 2017 21:44:05 +0100 Subject: net: rtnetlink: use rcu to free rtnl message handlers rtnetlink is littered with READ_ONCE() because we can have read accesses while another cpu can write to the structure we're reading by (un)registering doit or dumpit handlers. This patch changes this so that (un)registering cpu allocates a new structure and then publishes it via rcu_assign_pointer, i.e. once another cpu can see such pointer no modifications will occur anymore. based on initial patch from Peter Zijlstra. Cc: Peter Zijlstra Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 154 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 53 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dabba2a91fc8..ff292d3f2c41 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -63,6 +63,7 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; unsigned int flags; + struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); @@ -127,7 +128,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -144,6 +145,20 @@ static inline int rtm_msgindex(int msgtype) return msgindex; } +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return tab[msgtype]; +} + /** * __rtnl_register - Register a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC @@ -166,28 +181,52 @@ int __rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - struct rtnl_link *tab; + struct rtnl_link **tab, *link, *old; int msgindex; + int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]); + rtnl_lock(); + tab = rtnl_msg_handlers[protocol]; if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; + /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } + + WARN_ON(doit && link->doit && link->doit != doit); if (doit) - tab[msgindex].doit = doit; + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) - tab[msgindex].dumpit = dumpit; - tab[msgindex].flags |= flags; + link->dumpit = dumpit; - return 0; + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; } EXPORT_SYMBOL_GPL(__rtnl_register); @@ -220,24 +259,25 @@ EXPORT_SYMBOL_GPL(rtnl_register); */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); - if (!handlers) { + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { rtnl_unlock(); return -ENOENT; } - handlers[msgindex].doit = NULL; - handlers[msgindex].dumpit = NULL; - handlers[msgindex].flags = 0; + link = tab[msgindex]; + rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); + kfree_rcu(link, rcu); + return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); @@ -251,20 +291,29 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; + int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rtnl_msg_handlers[protocol]; RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = tab[msgindex]; + if (!link) + continue; + + rcu_assign_pointer(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } rtnl_unlock(); synchronize_net(); while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) schedule(); - kfree(handlers); + kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -2973,18 +3022,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + struct rtnl_link **tab; int type = cb->nlh->nlmsg_type-RTM_BASE; - struct rtnl_link *handlers; + struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; - handlers = rtnl_dereference(rtnl_msg_handlers[idx]); - if (!handlers) + if (type < 0 || type >= RTM_NR_MSGTYPES) continue; - dumpit = READ_ONCE(handlers[type].dumpit); + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = tab[type]; + if (!link) + continue; + + dumpit = link->dumpit; if (!dumpit) continue; @@ -4314,7 +4371,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct rtnl_link *handlers; + struct rtnl_link *link; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; @@ -4338,32 +4395,20 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (family >= ARRAY_SIZE(rtnl_msg_handlers)) - family = PF_UNSPEC; - rcu_read_lock(); - handlers = rcu_dereference(rtnl_msg_handlers[family]); - if (!handlers) { - family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); - } - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; u16 min_dump_alloc = 0; - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) { + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]); - if (!handlers) - goto err_unlock; - - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) goto err_unlock; } + dumpit = link->dumpit; refcount_inc(&rtnl_msg_handlers_ref[family]); @@ -4384,33 +4429,36 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } - doit = READ_ONCE(handlers[type].doit); - if (!doit) { + link = rtnl_get_link(family, type); + if (!link || !link->doit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; } - flags = READ_ONCE(handlers[type].flags); + flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { refcount_inc(&rtnl_msg_handlers_ref[family]); - doit = READ_ONCE(handlers[type].doit); + doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } - rcu_read_unlock(); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[family]); - if (handlers) { - doit = READ_ONCE(handlers[type].doit); - if (doit) - err = doit(skb, nlh, extack); - } + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); rtnl_unlock(); + + return err; + +out_unlock: + rcu_read_unlock(); return err; err_unlock: -- cgit From e4202511480da5f8e6870d8f6ecbb821aeaa8caf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 2 Dec 2017 21:44:06 +0100 Subject: rtnetlink: get reference on module before invoking handlers Add yet another rtnl_register function. It will be used by modules that can be removed. The passed module struct is used to prevent module unload while a netlink dump is in progress or when a DOIT_UNLOCKED doit callback is called. Cc: Peter Zijlstra Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 113 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 35 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ff292d3f2c41..de6390365c90 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,6 +62,7 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; + struct module *owner; unsigned int flags; struct rcu_head rcu; }; @@ -129,7 +130,6 @@ EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; -static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -159,27 +159,10 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) return tab[msgtype]; } -/** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) { struct rtnl_link **tab, *link, *old; int msgindex; @@ -210,6 +193,9 @@ int __rtnl_register(int protocol, int msgtype, goto unlock; } + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; + WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; @@ -228,6 +214,54 @@ unlock: rtnl_unlock(); return ret; } + +/** + * rtnl_register_module - Register a rtnetlink message type + * + * @owner: module registering the hook (THIS_MODULE) + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Like rtnl_register, but for use by removable modules. + */ +int rtnl_register_module(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(owner, protocol, msgtype, + doit, dumpit, flags); +} +EXPORT_SYMBOL_GPL(rtnl_register_module); + +/** + * __rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(NULL, protocol, msgtype, + doit, dumpit, flags); +} EXPORT_SYMBOL_GPL(__rtnl_register); /** @@ -311,8 +345,6 @@ void rtnl_unregister_all(int protocol) synchronize_net(); - while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) - schedule(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -4372,6 +4404,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct rtnl_link *link; + struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; @@ -4408,24 +4441,32 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (!link || !link->dumpit) goto err_unlock; } + owner = link->owner; dumpit = link->dumpit; - refcount_inc(&rtnl_msg_handlers_ref[family]); - if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + rcu_read_unlock(); rtnl = net->rtnl; - { + if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, + .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. + */ + module_put(owner); } - refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } @@ -4437,14 +4478,19 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, goto out_unlock; } + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; + } + flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { - refcount_inc(&rtnl_msg_handlers_ref[family]); doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); - refcount_dec(&rtnl_msg_handlers_ref[family]); + module_put(owner); return err; } rcu_read_unlock(); @@ -4455,6 +4501,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, err = link->doit(skb, nlh, extack); rtnl_unlock(); + module_put(owner); + return err; out_unlock: @@ -4546,11 +4594,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++) - refcount_set(&rtnl_msg_handlers_ref[i], 1); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); -- cgit From 16feebcf2350aa369001a529f50ce33f2472c01c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 2 Dec 2017 21:44:08 +0100 Subject: rtnetlink: remove __rtnl_register This removes __rtnl_register and switches callers to either rtnl_register or rtnl_register_module. Also, rtnl_register() will now print an error if memory allocation failed rather than panic the kernel. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index de6390365c90..fb2d61df1e2f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -238,7 +238,7 @@ int rtnl_register_module(struct module *owner, EXPORT_SYMBOL_GPL(rtnl_register_module); /** - * __rtnl_register - Register a rtnetlink message type + * rtnl_register - Register a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message @@ -252,35 +252,18 @@ EXPORT_SYMBOL_GPL(rtnl_register_module); * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - return rtnl_register_internal(NULL, protocol, msgtype, - doit, dumpit, flags); -} -EXPORT_SYMBOL_GPL(__rtnl_register); - -/** - * rtnl_register - Register a rtnetlink message type - * - * Identical to __rtnl_register() but panics on failure. This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continuing. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); + int err; + + err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, + flags); + if (err) + pr_err("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", protocol, msgtype); } EXPORT_SYMBOL_GPL(rtnl_register); -- cgit From a3fde2addd5f0218b64102005a237ef727b0dc30 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Dec 2017 19:19:18 +0100 Subject: rtnetlink: ipv6: convert remaining users to rtnl_register_module convert remaining users of rtnl_register to rtnl_register_module and un-export rtnl_register. Requested-by: David S. Miller Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a55d0c236b40..642b3afb12b9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -265,7 +265,6 @@ void rtnl_register(int protocol, int msgtype, pr_err("Unable to register rtnetlink message handler, " "protocol = %d, message type = %d\n", protocol, msgtype); } -EXPORT_SYMBOL_GPL(rtnl_register); /** * rtnl_unregister - Unregister a rtnetlink message type -- cgit From 792f3dd6f02b25afc0b42adce2efd523e81c3fb1 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 4 Dec 2017 14:18:29 -0800 Subject: bpf: move bpf csum flag check trivial move the BPF_F_ZERO_CSUM_TX check right below the 'flags & BPF_F_DONT_FRAGMENT', so common tun_flags handling is logically together. Signed-off-by: William Tu Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 8ec5a504eb28..4d644ad17457 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3013,6 +3013,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3028,9 +3030,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); } - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - return 0; } -- cgit From f19397a5c65665d66e3866b42056f1f58b7a366b Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 1 Dec 2017 10:15:04 -0800 Subject: bpf: Add access to snd_cwnd and others in sock_ops Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these fields are only valid if the socket associated with the sock_ops program call is a full socket, the field is_fullsock is also added to the bpf_sock_ops struct. If the socket is not a full socket, reading these fields returns 0. Note that in most cases it will not be necessary to check is_fullsock to know if there is a full socket. The context of the call, as specified by the 'op' field, can sometimes determine whether there is a full socket. The struct bpf_sock_ops has the following fields added: __u32 is_fullsock; /* Some TCP fields are only valid if * there is a full socket. If not, the * fields read as zero. */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add read access to more 32 bit tcp_sock fields. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/filter.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 4d644ad17457..754abe1041b7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + +/* Helper macro for adding read access to tcp_sock fields. */ +#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + offsetof(struct tcp_sock, FIELD_NAME)); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP32(snd_cwnd); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP32(srtt_us); + break; } return insn - insn_buf; } -- cgit From b0e9fe1ba7f8305dc1c640fbeb1b8c5c609e604c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Dec 2017 22:42:30 +0100 Subject: rtnetlink: fix rtnl_link msghandler rcu annotations Incorrect/missing annotations caused a few sparse warnings: rtnetlink.c:155:15: incompatible types .. (different address spaces) rtnetlink.c:157:23: incompatible types .. (different address spaces) rtnetlink.c:185:15: incompatible types .. (different address spaces) rtnetlink.c:285:15: incompatible types .. (different address spaces) rtnetlink.c:317:9: incompatible types .. (different address spaces) rtnetlink.c:3054:23: incompatible types .. (different address spaces) no change in generated code. Fixes: addf9b90de22f7 ("net: rtnetlink: use rcu to free rtnl message handlers") Reported-by: kbuild test robot Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 642b3afb12b9..a4faefd65006 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -129,7 +129,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -164,7 +164,8 @@ static int rtnl_register_internal(struct module *owner, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - struct rtnl_link **tab, *link, *old; + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; -- cgit From 62b32379fd124fea521484ba7e220d8a449f0b59 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 4 Dec 2017 11:31:48 +0100 Subject: flow_dissector: dissect tunnel info outside __skb_flow_dissect() Move dissection of tunnel info to outside of the main flow dissection function, __skb_flow_dissect(). The sole user of this feature, the flower classifier, is updated to call tunnel info dissection directly, using skb_flow_dissect_tunnel_info(). This results in a slightly less complex implementation of __skb_flow_dissect(), in particular removing logic from that call path which is not used by the majority of users. The expense of this is borne by the flower classifier which now has to make an extra call for tunnel info dissection. This patch should not result in any behavioural change. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15ce30063765..cc75488d3653 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -133,10 +133,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, ctrl->addr_type = type; } -static void -__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; @@ -212,6 +212,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->dst = key->tp_dst; } } +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, @@ -576,9 +577,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - __skb_flow_dissect_tunnel_info(skb, flow_dissector, - target_container); - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); -- cgit From 6c148184b5c868ad2c8a5a4a777cd8097622368a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:06 -0800 Subject: net: sched: cleanup qdisc_run and __qdisc_run semantics Currently __qdisc_run calls qdisc_run_end() but does not call qdisc_run_begin(). This makes it hard to track pairs of qdisc_run_{begin,end} across function calls. To simplify reading these code paths this patch moves begin/end calls into qdisc_run(). Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6bea8931bb62..44c7de365f55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3192,9 +3192,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + } + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@ -3204,6 +3204,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); -- cgit From 6b3ba9146fe64b9bebb6346c9dcfe3b4851de2d7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:25 -0800 Subject: net: sched: allow qdiscs to handle locking This patch adds a flag for queueing disciplines to indicate the stack does not need to use the qdisc lock to protect operations. This can be used to build lockless scheduling algorithms and improving performance. The flag is checked in the tx path and the qdisc lock is only taken if it is not set. For now use a conditional if statement. Later we could be more aggressive if it proves worthwhile and use a static key or wrap this in a likely(). Also the lockless case drops the TCQ_F_CAN_BYPASS logic. The reason for this is synchronizing a qlen counter across threads proves to cost more than doing the enqueue/dequeue operations when tested with pktgen. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/core/dev.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 44c7de365f55..e32cf5c7f200 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -4144,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } } -- cgit From b01ac095c740fc21f4bb21abe900b0f5b3042cf9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:20 -0800 Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq The sch_mq qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Also exports __gnet_stats_copy_queue() to use as a helper function. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/core/gen_stats.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 87f28557b329..b2b2323bdc84 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV -- cgit From 46e6b992c2502b094e61da6994f1363f3b7c1413 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 7 Dec 2017 15:40:19 -0800 Subject: rtnetlink: allow GSO maximums to be set on device creation Netlink device already allows changing GSO sizes with ip set command. The part that is missing is allowing overriding GSO settings on device creation. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a4faefd65006..412ebf0b09c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1637,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -2287,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (max_size > GSO_MAX_SIZE) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (max_segs > GSO_MAX_SEGS) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_segs ^ max_segs) { + dev->gso_max_segs = max_segs; + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -2651,6 +2681,10 @@ struct net_device *rtnl_create_link(struct net *net, dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + dev->gso_max_size = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } -- cgit From a0b586fa75a69578ecf10b40582eed9b35de2432 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 8 Dec 2017 15:34:13 -0800 Subject: rtnetlink: fix typo in GSO max segments Fixes: 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 412ebf0b09c6..c688dc564b11 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2684,7 +2684,7 @@ struct net_device *rtnl_create_link(struct net *net, if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) - dev->gso_max_size = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } -- cgit From 8d74e9f88d65af8bb2e095aff506aa6eac755ada Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Dec 2017 11:39:04 -0500 Subject: net: avoid skb_warn_bad_offload on IS_ERR skb_warn_bad_offload warns when packets enter the GSO stack that require skb_checksum_help or vice versa. Do not warn on arbitrary bad packets. Packet sockets can craft many. Syzkaller was able to demonstrate another one with eth_type games. In particular, suppress the warning when segmentation returns an error, which is for reasons other than checksum offload. See also commit 36c92474498a ("net: WARN if skb_checksum_help() is called on skb requiring segmentation") for context on this warning. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8aa2f70995e8..b0eee49a2489 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2803,7 +2803,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path))) + if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; -- cgit From c060bc6115b6a204cb60e6b03fe64135731bc6c8 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Tue, 19 Dec 2017 07:17:15 +0800 Subject: bpf: make function xdp_do_generic_redirect_map() static The function xdp_do_generic_redirect_map() is only used in this file, so make it static. Clean up sparse warning: net/core/filter.c:2687:5: warning: no previous prototype for 'xdp_do_generic_redirect_map' [-Wmissing-prototypes] Signed-off-by: Xiongwei Song Signed-off-by: Daniel Borkmann --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 754abe1041b7..130b842c3a15 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2684,8 +2684,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) return 0; } -int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; -- cgit From 08fc7f8140730d2f8499c91b5abad44581b74635 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 14 Dec 2017 05:51:57 -0800 Subject: sock: Change the netns_core member name. Change the member name will make the code more readable. This patch will be used in next patch. Signed-off-by: Martin Zhang Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/core/sock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index c0b5b2f17412..c2dd2d339db7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3045,7 +3045,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); + __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -3055,7 +3055,7 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) int res = 0; for_each_possible_cpu(cpu) - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } @@ -3063,13 +3063,13 @@ EXPORT_SYMBOL_GPL(sock_prot_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { - net->core.inuse = alloc_percpu(struct prot_inuse); - return net->core.inuse ? 0 : -ENOMEM; + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + return net->core.prot_inuse ? 0 : -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { - free_percpu(net->core.inuse); + free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { -- cgit From 648845ab7e200993dccd3948c719c858368c91e7 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 14 Dec 2017 05:51:58 -0800 Subject: sock: Move the socket inuse to namespace. In some case, we want to know how many sockets are in use in different _net_ namespaces. It's a key resource metric. This patch add a member in struct netns_core. This is a counter for socket-inuse in the _net_ namespace. The patch will add/sub counter in the sk_alloc, sk_clone_lock and __sk_free. This patch will not counter the socket created in kernel. It's not very useful for userspace to know how many kernel sockets we created. The main reasons for doing this are that: 1. When linux calls the 'do_exit' for process to exit, the functions 'exit_task_namespaces' and 'exit_task_work' will be called sequentially. 'exit_task_namespaces' may have destroyed the _net_ namespace, but 'sock_release' called in 'exit_task_work' may use the _net_ namespace if we counter the socket-inuse in sock_release. 2. socket and sock are in pair. More important, sock holds the _net_ namespace. We counter the socket-inuse in sock, for avoiding holding _net_ namespace again in socket. It's a easy way to maintain the code. Signed-off-by: Martin Zhang Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/core/sock.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index c2dd2d339db7..72d14b221784 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,8 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +static void sock_inuse_add(struct net *net, int val); + /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through @@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; - if (likely(sk->sk_net_refcnt)) + if (likely(sk->sk_net_refcnt)) { get_net(net); + sock_inuse_add(net, 1); + } + sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); @@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk) static void __sk_free(struct sock *sk) { + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) sock_diag_broadcast_destroy(sk); else @@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1); /* * Before updating sk_refcnt, we must commit prior changes to memory @@ -3061,15 +3071,44 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +static void sock_inuse_add(struct net *net, int val) +{ + this_cpu_add(*net->core.sock_inuse, val); +} + +int sock_inuse_get(struct net *net) +{ + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += *per_cpu_ptr(net->core.sock_inuse, cpu); + + return res; +} + +EXPORT_SYMBOL_GPL(sock_inuse_get); + static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); - return net->core.prot_inuse ? 0 : -ENOMEM; + if (net->core.prot_inuse == NULL) + return -ENOMEM; + + net->core.sock_inuse = alloc_percpu(int); + if (net->core.sock_inuse == NULL) + goto out; + + return 0; + +out: + free_percpu(net->core.prot_inuse); + return -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); + free_percpu(net->core.sock_inuse); } static struct pernet_operations net_inuse_ops = { @@ -3112,6 +3151,10 @@ static inline void assign_proto_idx(struct proto *prot) static inline void release_proto_idx(struct proto *prot) { } + +static void sock_inuse_add(struct net *net, int val) +{ +} #endif static void req_prot_cleanup(struct request_sock_ops *rsk_prot) -- cgit From fb1f5f79ae96331a0201b4080d34f3bc3b5c0b1d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 16 Dec 2017 03:09:40 -0500 Subject: net: Introduce NETIF_F_GRO_HW. Introduce NETIF_F_GRO_HW feature flag for NICs that support hardware GRO. With this flag, we can now independently turn on or off hardware GRO when GRO is on. Previously, drivers were using NETIF_F_GRO to control hardware GRO and so it cannot be independently turned on or off without affecting GRO. Hardware GRO (just like GRO) guarantees that packets can be re-segmented by TSO/GSO to reconstruct the original packet stream. Logically, GRO_HW should depend on GRO since it a subset, but we will let individual drivers enforce this dependency as they see fit. Since NETIF_F_GRO is not propagated between upper and lower devices, NETIF_F_GRO_HW should follow suit since it is a subset of GRO. In other words, a lower device can independent have GRO/GRO_HW enabled or disabled and no feature propagation is required. This will preserve the current GRO behavior. This can be changed later if we decide to propagate GRO/ GRO_HW/RXCSUM from upper to lower devices. Cc: Ariel Elior Cc: everest-linux-l2@cavium.com Signed-off-by: Michael Chan Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++++++++ net/core/ethtool.c | 1 + 2 files changed, 13 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b0eee49a2489..4b43f5dcabcd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7424,6 +7424,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~dev->gso_partial_features; } + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + return features; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f8fcf450a36e..50a79203043b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LLTX_BIT] = "tx-lockless", [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", -- cgit From 56f5aa77cdad1076bea0ae8ddeb74ba68ddc9502 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 16 Dec 2017 03:09:41 -0500 Subject: net: Disable GRO_HW when generic XDP is installed on a device. Hardware should not aggregate any packets when generic XDP is installed. Cc: Ariel Elior Cc: everest-linux-l2@cavium.com Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- net/core/dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4b43f5dcabcd..c7db39926769 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1542,6 +1542,23 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +/** + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device + * @dev: device + * + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. + */ +static void dev_disable_gro_hw(struct net_device *dev) +{ + dev->wanted_features &= ~NETIF_F_GRO_HW; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { @@ -4564,6 +4581,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } else if (new && !old) { static_key_slow_inc(&generic_xdp_needed); dev_disable_lro(dev); + dev_disable_gro_hw(dev); } break; -- cgit From 3dca3f38cfb8efb8571040568cac7d0025fa5bb1 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 20 Dec 2017 10:41:31 +0100 Subject: xfrm: Separate ESP handling from segmentation for GRO packets. We change the ESP GSO handlers to only segment the packets. The ESP handling and encryption is defered to validate_xmit_xfrm() where this is done for non GRO packets too. This makes the code more robust and prepares for asynchronous crypto handling. Signed-off-by: Steffen Klassert --- net/core/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c7db39926769..fb7a24a373d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3083,9 +3083,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; - if (validate_xmit_xfrm(skb, features)) - goto out_kfree_skb; - /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. @@ -3102,6 +3099,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device } } + skb = validate_xmit_xfrm(skb, features); + return skb; out_kfree_skb: -- cgit From f53c723902d1ac5f0b0a11d7c9dcbff748dde74e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 20 Dec 2017 10:41:36 +0100 Subject: net: Add asynchronous callbacks for xfrm on layer 2. This patch implements asynchronous crypto callbacks and a backlog handler that can be used when IPsec is done at layer 2 in the TX path. It also extends the skb validate functions so that we can update the driver transmit return codes based on async crypto operation or to indicate that we queued the packet in a backlog queue. Joint work with: Aviv Heller Signed-off-by: Steffen Klassert --- net/core/dev.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fb7a24a373d1..821dd8cb7169 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3059,7 +3059,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } EXPORT_SYMBOL(skb_csum_hwoffload_help); -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; @@ -3099,7 +3099,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device } } - skb = validate_xmit_xfrm(skb, features); + skb = validate_xmit_xfrm(skb, features, again); return skb; @@ -3110,7 +3110,7 @@ out_null: return NULL; } -struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; @@ -3121,7 +3121,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d /* in case skb wont be segmented, point to itself */ skb->prev = skb; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; @@ -3448,6 +3448,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; + bool again = false; skb_reset_mac_header(skb); @@ -3509,7 +3510,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) XMIT_RECURSION_LIMIT)) goto recursion_alert; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; @@ -4193,6 +4194,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) spin_unlock(root_lock); } } + + xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) @@ -8874,6 +8877,9 @@ static int __init net_dev_init(void) skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS -- cgit From 5b0890a97204627d75a333fc30f29f737e2bfad6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 21 Dec 2017 10:17:42 +0100 Subject: flow_dissector: Parse batman-adv unicast headers The batman-adv unicast packets contain a full layer 2 frame in encapsulated form. The flow dissector must therefore be able to parse the batman-adv unicast header to reach the layer 2+3 information. +--------------------+ | ip(v6)hdr | +--------------------+ | inner ethhdr | +--------------------+ | batadv unicast hdr | +--------------------+ | outer ethhdr | +--------------------+ The obtained information from the upper layer can then be used by RPS to schedule the processing on separate cores. This allows better distribution of multiple flows from the same neighbor to different cores. Signed-off-by: Sven Eckelmann Reviewed-by: Jiri Pirko Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index cc75488d3653..02db7b122a73 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -24,6 +24,7 @@ #include #include #include +#include static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -437,6 +438,57 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, return FLOW_DISSECT_RET_PROTO_AGAIN; } +/** + * __skb_flow_dissect_batadv() - dissect batman-adv header + * @skb: sk_buff to with the batman-adv header + * @key_control: flow dissectors control key + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @p_proto: pointer used to update the protocol to process next + * @p_nhoff: pointer used to update inner network header offset + * @hlen: packet header length + * @flags: any combination of FLOW_DISSECTOR_F_* + * + * ETH_P_BATMAN packets are tried to be dissected. Only + * &struct batadv_unicast packets are actually processed because they contain an + * inner ethernet header and are usually followed by actual network header. This + * allows the flow dissector to continue processing the packet. + * + * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, + * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, + * otherwise FLOW_DISSECT_RET_OUT_BAD + */ +static enum flow_dissect_ret +__skb_flow_dissect_batadv(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + void *data, __be16 *p_proto, int *p_nhoff, int hlen, + unsigned int flags) +{ + struct { + struct batadv_unicast_packet batadv_unicast; + struct ethhdr eth; + } *hdr, _hdr; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) + return FLOW_DISSECT_RET_OUT_BAD; + + *p_proto = hdr->eth.h_proto; + *p_nhoff += sizeof(*hdr); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -815,6 +867,11 @@ proto_again: nhoff, hlen); break; + case htons(ETH_P_BATMAN): + fdret = __skb_flow_dissect_batadv(skb, key_control, data, + &proto, &nhoff, hlen, flags); + break; + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; -- cgit From bf5c25d608613eaf4dcdba5a9cac5b2afe67d635 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Dec 2017 19:00:17 -0500 Subject: skbuff: in skb_segment, call zerocopy functions once per nskb This is a net-next follow-up to commit 268b79067942 ("skbuff: orphan frags before zerocopy clone"), which fixed a bug in net, but added a call to skb_zerocopy_clone at each frag to do so. When segmenting skbs with user frags, either the user frags must be replaced with private copies and uarg released, or the uarg must have its refcount increased for each new skb. skb_orphan_frags does the first, except for cases that can handle reference counting. skb_zerocopy_clone then does the second. Call these once per nskb, instead of once per frag. That is, in the common case. With a frag list, also refresh when the origin skb (frag_skb) changes. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a3cb0be4c6f3..00b0757830e2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3656,6 +3656,10 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; + while (pos < offset + len) { if (i >= nfrags) { BUG_ON(skb_headlen(list_skb)); @@ -3667,6 +3671,11 @@ normal: BUG_ON(!nfrags); + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, + GFP_ATOMIC)) + goto err; + list_skb = list_skb->next; } @@ -3678,11 +3687,6 @@ normal: goto err; } - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) - goto err; - if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) - goto err; - *nskb_frag = *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); -- cgit From aecd67b60722dd24353b0bc50e78a55b30707dcd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:25:13 +0100 Subject: xdp: base API for new XDP rx-queue info concept This patch only introduce the core data structures and API functions. All XDP enabled drivers must use the API before this info can used. There is a need for XDP to know more about the RX-queue a given XDP frames have arrived on. For both the XDP bpf-prog and kernel side. Instead of extending xdp_buff each time new info is needed, the patch creates a separate read-mostly struct xdp_rxq_info, that contains this info. We stress this data/cache-line is for read-only info. This is NOT for dynamic per packet info, use the data_meta for such use-cases. The performance advantage is this info can be setup at RX-ring init time, instead of updating N-members in xdp_buff. A possible (driver level) micro optimization is that xdp_buff->rxq assignment could be done once per XDP/NAPI loop. The extra pointer deref only happens for program needing access to this info (thus, no slowdown to existing use-cases). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/Makefile | 2 +- net/core/xdp.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 net/core/xdp.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 1fd0a9c88b1b..6dbbba8c57ae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o + fib_notifier.o xdp.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 000000000000..229bc5a0ee04 --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,67 @@ +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#include +#include + +#include + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index) +{ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); -- cgit From c0124f327e5cabd844a10d7e1fc5aa2a81e796a9 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:25:34 +0100 Subject: xdp/qede: setup xdp_rxq_info and intro xdp_rxq_info_is_reg The driver code qede_free_fp_array() depend on kfree() can be called with a NULL pointer. This stems from the qede_alloc_fp_array() function which either (kz)alloc memory for fp->txq or fp->rxq. This also simplifies error handling code in case of memory allocation failures, but xdp_rxq_info_unreg need to know the difference. Introduce xdp_rxq_info_is_reg() to handle if a memory allocation fails and detect this is the failure path by seeing that xdp_rxq_info was not registred yet, which first happens after successful alloaction in qede_init_fp(). Driver hook points for xdp_rxq_info: * reg : qede_init_fp * unreg: qede_free_fp_array Tested on actual hardware with samples/bpf program. V2: Driver have no proper error path for failed XDP RX-queue info reg, as qede_init_fp() is a void function. Cc: everest-linux-l2@cavium.com Cc: Ariel Elior Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/xdp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/xdp.c b/net/core/xdp.c index 229bc5a0ee04..097a0f74e004 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -65,3 +65,9 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) xdp_rxq->reg_state = REG_STATE_UNUSED; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); + +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) +{ + return (xdp_rxq->reg_state == REG_STATE_REGISTERED); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); -- cgit From e817f85652c14d78f170b18797e4c477c78949e0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:26:09 +0100 Subject: xdp: generic XDP handling of xdp_rxq_info Hook points for xdp_rxq_info: * reg : netif_alloc_rx_queues * unreg: netif_free_rx_queues The net_device have some members (num_rx_queues + real_num_rx_queues) and data-area (dev->_rx with struct netdev_rx_queue's) that were primarily used for exporting information about RPS (CONFIG_RPS) queues to sysfs (CONFIG_SYSFS). For generic XDP extend struct netdev_rx_queue with the xdp_rxq_info, and remove some of the CONFIG_SYSFS ifdefs. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/dev.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2eb66c0d9cdb..d7925ef8743d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3906,9 +3906,33 @@ drop: return NET_RX_DROP; } +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; void *orig_data; @@ -3952,6 +3976,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; + rxqueue = netif_get_rxqueue(skb); + xdp.rxq = &rxqueue->xdp_rxq; + act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - orig_data; @@ -7589,12 +7616,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); @@ -7604,11 +7631,39 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + rx = dev->_rx; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -8169,12 +8224,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { @@ -8231,12 +8284,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8275,9 +8326,7 @@ void free_netdev(struct net_device *dev) might_sleep(); netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kvfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); -- cgit From 02dd3291b2f095bbc88e1d2628fd5bf2e92de69b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:26:14 +0100 Subject: bpf: finally expose xdp_rxq_info to XDP bpf-programs Now all XDP driver have been updated to setup xdp_rxq_info and assign this to xdp_buff->rxq. Thus, it is now safe to enable access to some of the xdp_rxq_info struct members. This patch extend xdp_md and expose UAPI to userspace for ingress_ifindex and rx_queue_index. Access happens via bpf instruction rewrite, that load data directly from struct xdp_rxq_info. * ingress_ifindex map to xdp_rxq_info->dev->ifindex * rx_queue_index map to xdp_rxq_info->queue_index Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 130b842c3a15..acdb94c0e97f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, + ifindex, 4, target_size)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct xdp_rxq_info, + queue_index, 4, target_size)); + break; } return insn - insn_buf; -- cgit From c5a9f6f0ab4054082dd5ce9bbdaa8e8ff05cf365 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Mon, 17 Jul 2017 13:47:07 +0300 Subject: net/core: Add drop counters to VF statistics Modern hardware can decide to drop packets going to/from a VF. Add receive and transmit drop counters to be displayed at hypervisor layer in iproute2 per VF statistics. Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- net/core/rtnetlink.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c688dc564b11..5421a3fd3ba1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) { + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } -- cgit From 37e2d99b59c4765112533a1d38174fea58d28a51 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Mon, 8 Jan 2018 16:00:24 +0200 Subject: ethtool: Ensure new ring parameters are within bounds during SRINGPARAM Add a sanity check to ensure that all requested ring parameters are within bounds, which should reduce errors in driver implementation. Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- net/core/ethtool.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index fff6314f4c5e..107b122c8969 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1693,14 +1693,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam; + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; - if (!dev->ethtool_ops->set_ringparam) + if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; + dev->ethtool_ops->get_ringparam(dev, &max); + + /* ensure new ring parameters are within the maximums */ + if (ringparam.rx_pending > max.rx_max_pending || + ringparam.rx_mini_pending > max.rx_mini_max_pending || + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || + ringparam.tx_pending > max.tx_max_pending) + return -EINVAL; + return dev->ethtool_ops->set_ringparam(dev, &ringparam); } -- cgit From 141b52a98ab45a835ff1ea869414faccdc255a72 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Jan 2018 01:20:01 -0800 Subject: net: use the right variant of kfree kvzalloc'ed memory should be kvfree'd. Fixes: e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d7925ef8743d..852a54c769a3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7645,7 +7645,7 @@ err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); - kfree(dev->_rx); + kvfree(dev->_rx); dev->_rx = NULL; return err; } -- cgit From 82aaff2f63443e1d6cc4a186ed9c2a5718123906 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Jan 2018 01:20:02 -0800 Subject: net: free RX queue structures Looks like commit e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info") replaced kvfree(dev->_rx) in free_netdev() with a call to netif_free_rx_queues() which doesn't actually free the rings? While at it remove the unnecessary temporary variable. Fixes: e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 852a54c769a3..74e1e5d31337 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7653,16 +7653,15 @@ err_rxq_info: static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; - struct netdev_rx_queue *rx; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; - rx = dev->_rx; - for (i = 0; i < count; i++) - xdp_rxq_info_unreg(&rx[i].xdp_rxq); + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, -- cgit From d584527c70399cf0d095396d696029f54a10cfd3 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 22 Nov 2017 10:57:41 -0800 Subject: net: Cap number of queues even with accel_priv With the recent fix to ixgbe we can cap the number of queues always regardless of if accel_priv is being used or not since the actual number of queues are being reported via real_num_tx_queues. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3d24d9a59086..94435cd09072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3420,8 +3420,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, else queue_index = __netdev_pick_tx(dev, skb); - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); -- cgit From daaf24c634ab951cad3dcef28492001ef9c931d0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 11 Jan 2018 17:39:09 +0100 Subject: bpf: simplify xdp_convert_ctx_access for xdp_rxq_info As pointed out by Daniel Borkmann, using bpf_target_off() is not necessary for xdp_rxq_info when extracting queue_index and ifindex, as these members are u32 like BPF_W. Also fix trivial spelling mistake introduced in same commit. Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs") Reported-by: Daniel Borkmann Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d4b190e63b79..db2ee8c7e1bd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4310,16 +4310,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct net_device, - ifindex, 4, target_size)); + offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct xdp_rxq_info, - queue_index, 4, target_size)); + offsetof(struct xdp_rxq_info, + queue_index)); break; } -- cgit From 273c28bc57ca9672f7b70bed764ecdfb964930c8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Jan 2018 18:28:31 +0300 Subject: net: Convert atomic_t net::count to refcount_t Since net could be obtained from RCU lists, and there is a race with net destruction, the patch converts net::count to refcount_t. This provides sanity checks for the cases of incrementing counter of already dead net, when maybe_get_net() has to used instead of get_net(). Drivers: allyesconfig and allmodconfig are OK. Suggested-by: Eric Dumazet Signed-off-by: Kirill Tkhai Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 6 +++--- net/core/net_namespace.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 799b75268291..7bf8b85ade16 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -961,7 +961,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1367,7 +1367,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!atomic_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 60a71be75aea..2213d45fcafd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -35,7 +35,7 @@ LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); struct net init_net = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), }; EXPORT_SYMBOL(init_net); @@ -224,10 +224,10 @@ int peernet2id_alloc(struct net *net, struct net *peer) bool alloc; int id; - if (atomic_read(&net->count) == 0) + if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); - alloc = atomic_read(&peer->count) == 0 ? false : true; + alloc = refcount_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) @@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - atomic_set(&net->count, 1); + refcount_set(&net->count, 1); refcount_set(&net->passive, 1); net->dev_base_seq = 1; net->user_ns = user_ns; -- cgit From 2406e7e546b223e8cf42c44ac7352d4d1fd1dbcd Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:02 +0100 Subject: devlink: Add per devlink instance lock This is a preparation before introducing resources and hot reload support. Currently there are two global lock where one protects all devlink access, and the second one protects devlink port access. This patch adds per devlink instance lock which protects the internal members which are the sb/dpipe/ resource/ports. By introducing this lock the global devlink port lock can be discarded. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 136 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 59 deletions(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 7d430c1d9c3e..2f71734c4ff6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -/* devlink_port_mutex - * - * Shared lock to guard lists of ports in all devlink devices. - */ -static DEFINE_MUTEX(devlink_port_mutex); - static struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); @@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) -#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3) - /* port is not needed but we need to ensure they don't - * change in the middle of command - */ + +/* The per devlink instance lock is taken by default in the pre-doit + * operation, yet several commands do not require this. The global + * devlink lock is taken and protects from disruption by user-calls. + */ +#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; + int err; mutex_lock(&devlink_mutex); devlink = devlink_get_from_info(info); @@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); } + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_lock(&devlink->lock); if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { info->user_ptr[0] = devlink; } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { struct devlink_port *devlink_port; - mutex_lock(&devlink_port_mutex); devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_port); + err = PTR_ERR(devlink_port); + goto unlock; } info->user_ptr[0] = devlink_port; } - if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) { - mutex_lock(&devlink_port_mutex); - } if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_sb); + err = PTR_ERR(devlink_sb); + goto unlock; } info->user_ptr[1] = devlink_sb; } return 0; + +unlock: + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + return err; } static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT || - ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) - mutex_unlock(&devlink_port_mutex); + struct devlink *devlink; + + devlink = devlink_get_from_info(info); + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; @@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { idx++; @@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_port_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) continue; + + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -2322,14 +2337,16 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_port_split_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .doit = devlink_nl_cmd_port_unsplit_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, @@ -2397,8 +2414,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, @@ -2406,8 +2422,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, @@ -2488,6 +2503,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + mutex_init(&devlink->lock); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -2550,16 +2566,16 @@ int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); if (devlink_port_index_exists(devlink, port_index)) { - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); return -EEXIST; } devlink_port->devlink = devlink; devlink_port->index = port_index; devlink_port->registered = true; list_add_tail(&devlink_port->list, &devlink->port_list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -2572,10 +2588,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register); */ void devlink_port_unregister(struct devlink_port *devlink_port) { + struct devlink *devlink = devlink_port->devlink; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); list_del(&devlink_port->list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -2651,7 +2669,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, struct devlink_sb *devlink_sb; int err = 0; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); if (devlink_sb_index_exists(devlink, sb_index)) { err = -EEXIST; goto unlock; @@ -2670,7 +2688,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, devlink_sb->egress_tc_count = egress_tc_count; list_add_tail(&devlink_sb->list, &devlink->sb_list); unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return err; } EXPORT_SYMBOL_GPL(devlink_sb_register); @@ -2679,11 +2697,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink_sb = devlink_sb_get_by_index(devlink, sb_index); WARN_ON(!devlink_sb); list_del(&devlink_sb->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree(devlink_sb); } EXPORT_SYMBOL_GPL(devlink_sb_unregister); @@ -2699,9 +2717,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); int devlink_dpipe_headers_register(struct devlink *devlink, struct devlink_dpipe_headers *dpipe_headers) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = dpipe_headers; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); @@ -2715,9 +2733,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); */ void devlink_dpipe_headers_unregister(struct devlink *devlink) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = NULL; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); @@ -2783,9 +2801,9 @@ int devlink_dpipe_table_register(struct devlink *devlink, table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); @@ -2801,17 +2819,17 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, { struct devlink_dpipe_table *table; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name); if (!table) goto unlock; list_del_rcu(&table->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree_rcu(table, rcu); return; unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); -- cgit From d9f9b9a4d05fab693fd23a9ecaa330e03ebe2c31 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:03 +0100 Subject: devlink: Add support for resource abstraction Add support for hardware resource abstraction over devlink. Each resource is identified via id, furthermore it contains information regarding its size and its related sub resources. Each resource can also provide its current occupancy. In some cases the sizes of some resources can be changed, yet for those changes to take place a hot driver reload may be needed. The reload capability will be introduced in the next patch. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 374 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 2f71734c4ff6..89b3704fa450 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2288,6 +2288,233 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, counters_enable); } +struct devlink_resource * +devlink_resource_find(struct devlink *devlink, + struct devlink_resource *resource, u64 resource_id) +{ + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + struct devlink_resource *child_resource; + + if (resource->id == resource_id) + return resource; + + child_resource = devlink_resource_find(devlink, resource, + resource_id); + if (child_resource) + return child_resource; + } + return NULL; +} + +void devlink_resource_validate_children(struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + bool size_valid = true; + u64 parts_size = 0; + + if (list_empty(&resource->resource_list)) + goto out; + + list_for_each_entry(child_resource, &resource->resource_list, list) + parts_size += child_resource->size_new; + + if (parts_size > resource->size) + size_valid = false; +out: + resource->size_valid = size_valid; +} + +static int devlink_nl_cmd_resource_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + u64 resource_id; + u64 size; + int err; + + if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || + !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + return -EINVAL; + resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + + if (!resource->resource_ops->size_validate) + return -EINVAL; + + size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); + err = resource->resource_ops->size_validate(devlink, size, + info->extack); + if (err) + return err; + + resource->size_new = size; + devlink_resource_validate_children(resource); + if (resource->parent) + devlink_resource_validate_children(resource->parent); + return 0; +} + +static void +devlink_resource_size_params_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + struct devlink_resource_size_params *size_params; + + size_params = resource->size_params; + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD); + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit); +} + +static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, + struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + struct nlattr *child_resource_attr; + struct nlattr *resource_attr; + + resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE); + if (!resource_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, + DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (resource->size != resource->size_new) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD); + if (resource->resource_ops && resource->resource_ops->occ_get) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->resource_ops->occ_get(devlink), + DEVLINK_ATTR_PAD); + devlink_resource_size_params_put(resource, skb); + if (list_empty(&resource->resource_list)) + goto out; + + if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, + resource->size_valid)) + goto nla_put_failure; + + child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!child_resource_attr) + goto nla_put_failure; + + list_for_each_entry(child_resource, &resource->resource_list, list) { + if (devlink_resource_put(devlink, skb, child_resource)) + goto resource_put_failure; + } + + nla_nest_end(skb, child_resource_attr); +out: + nla_nest_end(skb, resource_attr); + return 0; + +resource_put_failure: + nla_nest_cancel(skb, child_resource_attr); +nla_put_failure: + nla_nest_cancel(skb, resource_attr); + return -EMSGSIZE; +} + +static int devlink_resource_fill(struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + struct nlattr *resources_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + resource = list_first_entry(&devlink->resource_list, + struct devlink_resource, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + + resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!resources_attr) + goto nla_put_failure; + + incomplete = false; + i = 0; + list_for_each_entry_from(resource, &devlink->resource_list, list) { + err = devlink_resource_put(devlink, skb, resource); + if (err) { + if (!i) + goto err_resource_put; + incomplete = true; + break; + } + i++; + } + nla_nest_end(skb, resources_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_resource_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->resource_list)) + return -EOPNOTSUPP; + + return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2306,6 +2533,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2466,6 +2695,20 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RESOURCE_SET, + .doit = devlink_nl_cmd_resource_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_DUMP, + .doit = devlink_nl_cmd_resource_dump, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2503,6 +2746,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); mutex_init(&devlink->lock); return devlink; } @@ -2833,6 +3077,136 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); +/** + * devlink_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @top_hierarchy: top hierarchy + * @reload_required: reload is required for new configuration to + * apply + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_reosurce_id: resource's parent id + * @size params: size parameters + * @resource_ops: resource ops + */ +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + struct devlink_resource *resource; + struct list_head *resource_list; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (resource) { + err = -EINVAL; + goto out; + } + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) { + err = -ENOMEM; + goto out; + } + + if (top_hierarchy) { + resource_list = &devlink->resource_list; + } else { + struct devlink_resource *parent_resource; + + parent_resource = devlink_resource_find(devlink, NULL, + parent_resource_id); + if (parent_resource) { + resource_list = &parent_resource->resource_list; + resource->parent = parent_resource; + } else { + err = -EINVAL; + goto out; + } + } + + resource->name = resource_name; + resource->size = resource_size; + resource->size_new = resource_size; + resource->id = resource_id; + resource->resource_ops = resource_ops; + resource->size_valid = true; + resource->size_params = size_params; + INIT_LIST_HEAD(&resource->resource_list); + list_add_tail(&resource->list, resource_list); +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * @resource: resource + */ +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + if (!resource) + mutex_lock(&devlink->lock); + + list_for_each_entry_safe(child_resource, tmp, resource_list, list) { + devlink_resources_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } + + if (!resource) + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resources_unregister); + +/** + * devlink_resource_size_get - get and update size + * + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update + */ +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) +{ + struct devlink_resource *resource; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) { + err = -EINVAL; + goto out; + } + *p_resource_size = resource->size_new; + resource->size = resource->size_new; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_size_get); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit From 2d8dc5bbf4e7603747875eb5cadcd67c1fa8b1bb Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:04 +0100 Subject: devlink: Add support for reload Add support for performing driver hot reload. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 89b3704fa450..4c3d85560436 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2515,6 +2515,45 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); } +static int +devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) +{ + struct list_head *resource_list; + int err = 0; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + if (!resource->size_valid) + return -EINVAL; + err = devlink_resources_validate(devlink, resource, info); + if (err) + return err; + } + return err; +} + +static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + int err; + + if (!devlink->ops->reload) + return -EOPNOTSUPP; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); + return err; + } + return devlink->ops->reload(devlink); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2709,6 +2748,14 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RELOAD, + .doit = devlink_nl_cmd_reload, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit From 56dc7cd0a87a1ff4f49ee1e67bd88e768385d51a Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:05 +0100 Subject: devlink: Add relation between dpipe and resource The hardware processes which are modeled via dpipe commonly use some internal hardware resources. Such relation can improve the understanding of hardware limitations. The number of resource's unit consumed per table's entry are also provided for each table. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 4c3d85560436..dd7d6dd07bfb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1694,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, table->counters_enabled)) goto nla_put_failure; + if (table->resource_valid) { + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD); + } if (devlink_dpipe_matches_put(table, skb)) goto nla_put_failure; @@ -3254,6 +3260,37 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_size_get); +/** + * devlink_dpipe_table_resource_set - set the resource id + * + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry + */ +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + struct devlink_dpipe_table *table; + int err = 0; + + mutex_lock(&devlink->lock); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) { + err = -EINVAL; + goto out; + } + table->resource_id = resource_id; + table->resource_units = resource_units; + table->resource_valid = true; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit From 96890d62523c2cddc2c053ad29de35c4d935cf11 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jan 2018 00:42:40 +0300 Subject: net: delete /proc THIS_MODULE references /proc has been ignoring struct file_operations::owner field for 10 years. Specifically, it started with commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba ("Fix rmmod/read/write races in /proc entries"). Notice the chunk where inode->i_fop is initialized with proxy struct file_operations for regular files: - if (de->proc_fops) - inode->i_fop = de->proc_fops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) + inode->i_fop = &proc_reg_file_ops; + else + inode->i_fop = de->proc_fops; + } VFS stopped pinning module at this point. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 - net/core/net-procfs.c | 4 ---- net/core/pktgen.c | 3 --- net/core/sock.c | 1 - 4 files changed, 9 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d1f5fe986edd..f96f9f58b894 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2862,7 +2862,6 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) }; static const struct file_operations neigh_stat_seq_fops = { - .owner = THIS_MODULE, .open = neigh_stat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 615ccab55f38..e010bb800d7b 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -182,7 +182,6 @@ static int dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, .open = dev_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -202,7 +201,6 @@ static int softnet_seq_open(struct inode *inode, struct file *file) } static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, .open = softnet_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -306,7 +304,6 @@ static int ptype_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, .open = ptype_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -387,7 +384,6 @@ static int dev_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, .open = dev_mc_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b9ce241cd28c..4fcfcb14e7c6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -523,7 +523,6 @@ static int pgctrl_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_fops = { - .owner = THIS_MODULE, .open = pgctrl_open, .read = seq_read, .llseek = seq_lseek, @@ -1804,7 +1803,6 @@ static int pktgen_if_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_if_fops = { - .owner = THIS_MODULE, .open = pktgen_if_open, .read = seq_read, .llseek = seq_lseek, @@ -1942,7 +1940,6 @@ static int pktgen_thread_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_thread_fops = { - .owner = THIS_MODULE, .open = pktgen_thread_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/sock.c b/net/core/sock.c index 72d14b221784..abf4cbff99b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3362,7 +3362,6 @@ static int proto_seq_open(struct inode *inode, struct file *file) } static const struct file_operations proto_seq_fops = { - .owner = THIS_MODULE, .open = proto_seq_open, .read = seq_read, .llseek = seq_lseek, -- cgit From 0c06bea919f3289368a023d1a62a1bc319617fa3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 16 Jan 2018 12:31:41 +0300 Subject: net: Fix possible race in peernet2id_alloc() peernet2id_alloc() is racy without rtnl_lock() as refcount_read(&peer->count) under net->nsid_lock does not guarantee, peer is alive: rcu_read_lock() peernet2id_alloc() .. spin_lock_bh(&net->nsid_lock) .. refcount_read(&peer->count) (!= 0) .. .. put_net() .. cleanup_net() .. for_each_net(tmp) .. spin_lock_bh(&tmp->nsid_lock) .. __peernet2id(tmp, net) == -1 .. .. .. .. __peernet2id_alloc(alloc == true) .. .. .. rcu_read_unlock() .. .. synchronize_rcu() .. kmem_cache_free(net) After the above situation, net::netns_id contains id pointing to freed memory, and any other dereferencing by the id will operate with this freed memory. Currently, peernet2id_alloc() is used under rtnl_lock() everywhere except ovs_vport_cmd_fill_info(), and this race can't occur. But peernet2id_alloc() is generic interface, and better we fix it before someone really starts use it in wrong context. v2: Don't place refcount_read(&net->count) under net->nsid_lock as suggested by Eric W. Biederman v3: Rebase on top of net-next Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2213d45fcafd..3c77d84ad60d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -221,17 +221,26 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - bool alloc; + bool alloc = false, alive = false; int id; if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); - alloc = refcount_read(&peer->count) == 0 ? false : true; + /* + * When peer is obtained from RCU lists, we may race with + * its cleanup. Check whether it's alive, and this guarantees + * we never hash a peer back to net->netns_ids, after it has + * just been idr_remove()'d from there in cleanup_net(). + */ + if (maybe_get_net(peer)) + alive = alloc = true; id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); + if (alive) + put_net(peer); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); -- cgit From 42157277af17d5c05946c700eb03877d60760d3c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 16 Jan 2018 12:31:54 +0300 Subject: net: Remove spinlock from get_net_ns_by_id() idr_find() is safe under rcu_read_lock() and maybe_get_net() guarantees that net is alive. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3c77d84ad60d..1ccb953b3b09 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -273,11 +273,9 @@ struct net *get_net_ns_by_id(struct net *net, int id) return NULL; rcu_read_lock(); - spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) peer = maybe_get_net(peer); - spin_unlock_bh(&net->nsid_lock); rcu_read_unlock(); return peer; -- cgit From 61f3c964dfd287b05d7ac6660a4f4ddfef84786c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Jan 2018 16:52:02 -0800 Subject: bpf: allow socket_filter programs to use bpf_prog_test_run in order to improve test coverage allow socket_filter program type to be run via bpf_prog_test_run command. Since such programs can be loaded by non-root tighten permissions for bpf_prog_test_run to be root only to avoid surprises. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index db2ee8c7e1bd..30fafaaa90fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4526,6 +4526,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { -- cgit From 43dd7512b51c0f3dd8170916bd3eeb2eba808ed1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Jan 2018 03:27:42 +0000 Subject: devlink: Make some functions static Fixes the following sparse warnings: net/core/devlink.c:2297:25: warning: symbol 'devlink_resource_find' was not declared. Should it be static? net/core/devlink.c:2322:6: warning: symbol 'devlink_resource_validate_children' was not declared. Should it be static? Signed-off-by: Wei Yongjun Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index dd7d6dd07bfb..66d36705fb9d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2294,7 +2294,7 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, counters_enable); } -struct devlink_resource * +static struct devlink_resource * devlink_resource_find(struct devlink *devlink, struct devlink_resource *resource, u64 resource_id) { @@ -2319,7 +2319,8 @@ devlink_resource_find(struct devlink *devlink, return NULL; } -void devlink_resource_validate_children(struct devlink_resource *resource) +static void +devlink_resource_validate_children(struct devlink_resource *resource) { struct devlink_resource *child_resource; bool size_valid = true; -- cgit From 205c380778d09291668da5267057a80385f89437 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:30 +0100 Subject: bpf: add csum_diff helper to xdp as well Useful for porting cls_bpf programs w/o increasing program complexity limits much at the same time, so add the helper to XDP as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 30fafaaa90fa..e5178851536f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3456,6 +3456,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: -- cgit From fa9dd599b4dae841924b022768354cfde9affecb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:33 +0100 Subject: bpf: get rid of pure_initcall dependency to enable jits Having a pure_initcall() callback just to permanently enable BPF JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave a small race window in future where JIT is still disabled on boot. Since we know about the setting at compilation time anyway, just initialize it properly there. Also consolidate all the individual bpf_jit_enable variables into a single one and move them under one location. Moreover, don't allow for setting unspecified garbage values on them. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/sysctl_net_core.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a47ad6cd41c0..6d39b4c01fc6 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -325,13 +326,14 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - .proc_handler = proc_dointvec -#else .proc_handler = proc_dointvec_minmax, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, -#endif +# else + .extra1 = &zero, + .extra2 = &two, +# endif }, # ifdef CONFIG_HAVE_EBPF_JIT { @@ -339,14 +341,18 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, # endif #endif -- cgit From 2e4a30983b0f9b19b59e38bbf7427d7fdd480d98 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:34 +0100 Subject: bpf: restrict access to core bpf sysctls Given BPF reaches far beyond just networking these days, it was never intended to allow setting and in some cases reading those knobs out of a user namespace root running without CAP_SYS_ADMIN, thus tighten such access. Also the bpf_jit_enable = 2 debugging mode should only be allowed if kptr_restrict is not set since it otherwise can leak addresses to the kernel log. Dump a note to the kernel log that this is for debugging JITs only when enabled. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/sysctl_net_core.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6d39b4c01fc6..f2d0462611c3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -251,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok())) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -326,7 +366,7 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, @@ -341,7 +381,7 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = &zero, .extra2 = &two, }, @@ -350,7 +390,7 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = &zero, .extra2 = &one, }, -- cgit From 1728a4f2ad6840746a6b1b9f01d652c5842f7e8d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:37 +0100 Subject: bpf: move event_output to const_size_or_zero for xdp/skb as well Similar rationale as in a60dd35d2e39 ("bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO"), change the type to CONST_SIZE_OR_ZERO such that we can better deal with optimized code. No changes needed in bpf_event_output() as it can also deal with 0 size entirely (e.g. as only wake-up signal with empty frame in perf RB, or packet dumps w/o meta data as another such possibility). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e5178851536f..9b9b70dee208 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2861,7 +2861,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -3150,7 +3150,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) -- cgit From b75703de16301b80f1eedecafdf37bb02c9e155f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 22 Jan 2018 10:31:19 +0000 Subject: devlink: fix memory leak on 'resource' Currently, if the call to devlink_resource_find returns null then the error exit path does not free the devlink_resource 'resource' and a memory leak occurs. Fix this by kfree'ing resource on the error exit path. Detected by CoverityScan, CID#1464184 ("Resource leak") Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction") Signed-off-by: Colin Ian King Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 66d36705fb9d..18d385ed8237 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3182,6 +3182,7 @@ int devlink_resource_register(struct devlink *devlink, resource_list = &parent_resource->resource_list; resource->parent = parent_resource; } else { + kfree(resource); err = -EINVAL; goto out; } -- cgit From b2d3bcfa26a7a8de41f358a6cae8b848673b3c6e Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Thu, 18 Jan 2018 09:59:13 -0800 Subject: net: core: Expose number of link up/down transitions Expose the number of times the link has been going UP or DOWN, and update the "carrier_changes" counter to be the sum of these two events. While at it, also update the sysfs-class-net documentation to cover: carrier_changes (3.15), carrier_up_count (4.16) and carrier_down_count (4.16) Signed-off-by: David Decotigny [Florian: * rebase * add documentation * merge carrier_changes with up/down counters] Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 25 ++++++++++++++++++++++++- net/core/rtnetlink.c | 13 +++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7bf8b85ade16..c4a28f4667b6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_changes)); + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); +static ssize_t carrier_up_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); +} +static DEVICE_ATTR_RO(carrier_up_count); + +static ssize_t carrier_down_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_down_count); + /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) @@ -547,6 +568,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, + &dev_attr_carrier_up_count.attr, + &dev_attr_carrier_down_count.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 16d644a4f974..97874daa1336 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -990,6 +990,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + 0; } @@ -1551,8 +1553,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + atomic_read(&dev->carrier_up_count) + + atomic_read(&dev->carrier_down_count)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || + nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, + atomic_read(&dev->carrier_up_count)) || + nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, + atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { @@ -1656,6 +1663,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_IF_NETNSID] = { .type = NLA_S32 }, + [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, + [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit From 5de30d5df95cfda14dfdbd6f8ed5021ab13be79b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Jan 2018 19:14:27 -0800 Subject: net: core: Fix kernel-doc for call_netdevice_notifiers_info() Remove the @dev comment, since we do not have a net_device argument, fixes the following kernel doc warning: /net/core/dev.c:1707: warning: Excess function parameter 'dev' description in 'call_netdevice_notifiers_info' Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 94435cd09072..7af0ef425ca3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1694,7 +1694,6 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function - * @dev: net_device pointer passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value -- cgit From 7a006d5988ebd99922784176d902a335b8eb5321 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Jan 2018 19:14:28 -0800 Subject: net: core: Fix kernel-doc for netdev_upper_link() Fixes the following warnings: ./net/core/dev.c:6438: warning: No description found for parameter 'extack' ./net/core/dev.c:6461: warning: No description found for parameter 'extack' Fixes: 42ab19ee9029 ("net: Add extack to upper device linking") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7af0ef425ca3..77795f66c246 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6424,6 +6424,7 @@ rollback: * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. @@ -6445,6 +6446,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices -- cgit From b76f4189df5c54a892ae54ac23908cc54ae7134f Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 22 Jan 2018 08:07:19 -0800 Subject: net: link_watch: mark bonding link events urgent It takes 1sec for bond link down notification to hit user-space when all slaves of the bond go down. 1sec is too long for protocol daemons in user-space relying on bond notification to recover (eg: multichassis lag implementations in user-space). Since the link event code already marks team device port link events as urgent, this patch moves the code to cover all lag ports and master. Signed-off-by: Roopa Prabhu Acked-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/link_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 982861607f88..e38e641e98d5 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -92,7 +92,7 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (dev->ifindex != dev_get_iflink(dev)) return true; - if (dev->priv_flags & IFF_TEAM_PORT) + if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); -- cgit From 57a5749b0fa3639a228d7c0ac080ee1704abe8fd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 18 Jan 2018 18:31:34 +0000 Subject: pktgen: Add missing !flag parameters o FLOW_SEQ now can be disabled with pgset "flag !FLOW_SEQ" o FLOW_SEQ and FLOW_RND are antonyms, as it's shown by pktgen_if_show() o IPSEC now may be disabled Note, that IPV6 is enabled with dst6/src6 parameters, not with a flag parameter. Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller --- net/core/pktgen.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 4fcfcb14e7c6..20f1c873a1ed 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1284,9 +1284,12 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!SVID_RND") == 0) pkt_dev->flags &= ~F_SVID_RND; - else if (strcmp(f, "FLOW_SEQ") == 0) + else if (strcmp(f, "FLOW_SEQ") == 0 || strcmp(f, "!FLOW_RND") == 0) pkt_dev->flags |= F_FLOW_SEQ; + else if (strcmp(f, "FLOW_RND") == 0 || strcmp(f, "!FLOW_SEQ") == 0) + pkt_dev->flags &= ~F_FLOW_SEQ; + else if (strcmp(f, "QUEUE_MAP_RND") == 0) pkt_dev->flags |= F_QUEUE_MAP_RND; @@ -1301,6 +1304,9 @@ static ssize_t pktgen_if_write(struct file *file, #ifdef CONFIG_XFRM else if (strcmp(f, "IPSEC") == 0) pkt_dev->flags |= F_IPSEC_ON; + + else if (strcmp(f, "!IPSEC") == 0) + pkt_dev->flags &= ~F_IPSEC_ON; #endif else if (strcmp(f, "!IPV6") == 0) -- cgit From 6f107c741212ca5fbfb0724571395716420017d6 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 18 Jan 2018 18:31:35 +0000 Subject: pktgen: Add behaviour flags macro to generate flags/names PKT_FALGS macro will be used to add package behavior names definitions to simplify the code that prints/reads pkg flags. Sorted the array in order of printing the flags in pktgen_if_show() Note: Renamed IPSEC_ON => IPSEC for simplicity. No visible behavior change expected. Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller --- net/core/pktgen.c | 57 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 20f1c873a1ed..e6d862214cc0 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -184,25 +184,36 @@ #define func_enter() pr_debug("entering %s\n", __func__); +#define PKT_FLAGS \ + pf(IPV6) /* Interface in IPV6 Mode */ \ + pf(IPSRC_RND) /* IP-Src Random */ \ + pf(IPDST_RND) /* IP-Dst Random */ \ + pf(TXSIZE_RND) /* Transmit size is random */ \ + pf(UDPSRC_RND) /* UDP-Src Random */ \ + pf(UDPDST_RND) /* UDP-Dst Random */ \ + pf(UDPCSUM) /* Include UDP checksum */ \ + pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \ + pf(MPLS_RND) /* Random MPLS labels */ \ + pf(QUEUE_MAP_RND) /* queue map Random */ \ + pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \ + pf(FLOW_SEQ) /* Sequential flows */ \ + pf(IPSEC) /* ipsec on for flows */ \ + pf(MACSRC_RND) /* MAC-Src Random */ \ + pf(MACDST_RND) /* MAC-Dst Random */ \ + pf(VID_RND) /* Random VLAN ID */ \ + pf(SVID_RND) /* Random SVLAN ID */ \ + pf(NODE) /* Node memory alloc*/ \ + +#define pf(flag) flag##_SHIFT, +enum pkt_flags { + PKT_FLAGS +}; +#undef pf + /* Device flag bits */ -#define F_IPSRC_RND (1<<0) /* IP-Src Random */ -#define F_IPDST_RND (1<<1) /* IP-Dst Random */ -#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ -#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ -#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ -#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ -#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ -#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ -#define F_MPLS_RND (1<<8) /* Random MPLS labels */ -#define F_VID_RND (1<<9) /* Random VLAN ID */ -#define F_SVID_RND (1<<10) /* Random SVLAN ID */ -#define F_FLOW_SEQ (1<<11) /* Sequential flows */ -#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ -#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ -#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ -#define F_NODE (1<<15) /* Node memory alloc*/ -#define F_UDPCSUM (1<<16) /* Include UDP checksum */ -#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ +#define pf(flag) static const __u32 F_##flag = (1<flags & F_IPSEC_ON) { + if (pkt_dev->flags & F_IPSEC) { seq_puts(seq, "IPSEC "); if (pkt_dev->spi) seq_printf(seq, "spi:%u", pkt_dev->spi); @@ -1303,10 +1314,10 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->flags &= ~F_QUEUE_MAP_CPU; #ifdef CONFIG_XFRM else if (strcmp(f, "IPSEC") == 0) - pkt_dev->flags |= F_IPSEC_ON; + pkt_dev->flags |= F_IPSEC; else if (strcmp(f, "!IPSEC") == 0) - pkt_dev->flags &= ~F_IPSEC_ON; + pkt_dev->flags &= ~F_IPSEC; #endif else if (strcmp(f, "!IPV6") == 0) @@ -2547,7 +2558,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) + if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); #endif pkt_dev->nflows++; @@ -2652,7 +2663,7 @@ static void free_SAs(struct pktgen_dev *pkt_dev) static int process_ipsec(struct pktgen_dev *pkt_dev, struct sk_buff *skb, __be16 protocol) { - if (pkt_dev->flags & F_IPSEC_ON) { + if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { -- cgit From 99c6d3d20d6287d7dd6e65686dba9e696de91f90 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 18 Jan 2018 18:31:36 +0000 Subject: pktgen: Remove brute-force printing of flags Add macro generated pkt_flag_names array, with a little help of which the flags can be printed by using an index. Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller --- net/core/pktgen.c | 77 ++++++++++++++----------------------------------------- 1 file changed, 19 insertions(+), 58 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e6d862214cc0..17fb036d7297 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -215,6 +215,14 @@ enum pkt_flags { PKT_FLAGS #undef pf +#define pf(flag) __stringify(flag), +static char *pkt_flag_names[] = { + PKT_FLAGS +}; +#undef pf + +#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names) + /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ #define T_RUN (1<<1) /* Start run */ @@ -545,6 +553,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) { const struct pktgen_dev *pkt_dev = seq->private; ktime_t stopped; + unsigned int i; u64 idle; seq_printf(seq, @@ -606,7 +615,6 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->src_mac_count, pkt_dev->dst_mac_count); if (pkt_dev->nr_labels) { - unsigned int i; seq_puts(seq, " mpls: "); for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), @@ -642,68 +650,21 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, " Flags: "); - if (pkt_dev->flags & F_IPV6) - seq_puts(seq, "IPV6 "); - - if (pkt_dev->flags & F_IPSRC_RND) - seq_puts(seq, "IPSRC_RND "); - - if (pkt_dev->flags & F_IPDST_RND) - seq_puts(seq, "IPDST_RND "); - - if (pkt_dev->flags & F_TXSIZE_RND) - seq_puts(seq, "TXSIZE_RND "); - - if (pkt_dev->flags & F_UDPSRC_RND) - seq_puts(seq, "UDPSRC_RND "); - - if (pkt_dev->flags & F_UDPDST_RND) - seq_puts(seq, "UDPDST_RND "); - - if (pkt_dev->flags & F_UDPCSUM) - seq_puts(seq, "UDPCSUM "); - - if (pkt_dev->flags & F_NO_TIMESTAMP) - seq_puts(seq, "NO_TIMESTAMP "); - - if (pkt_dev->flags & F_MPLS_RND) - seq_puts(seq, "MPLS_RND "); - - if (pkt_dev->flags & F_QUEUE_MAP_RND) - seq_puts(seq, "QUEUE_MAP_RND "); - - if (pkt_dev->flags & F_QUEUE_MAP_CPU) - seq_puts(seq, "QUEUE_MAP_CPU "); + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (i == F_FLOW_SEQ) + if (!pkt_dev->cflows) + continue; - if (pkt_dev->cflows) { - if (pkt_dev->flags & F_FLOW_SEQ) - seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/ - else - seq_puts(seq, "FLOW_RND "); - } + if (pkt_dev->flags & (1 << i)) + seq_printf(seq, "%s ", pkt_flag_names[i]); + else if (i == F_FLOW_SEQ) + seq_puts(seq, "FLOW_RND "); #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC) { - seq_puts(seq, "IPSEC "); - if (pkt_dev->spi) + if (i == F_IPSEC && pkt_dev->spi) seq_printf(seq, "spi:%u", pkt_dev->spi); - } #endif - - if (pkt_dev->flags & F_MACSRC_RND) - seq_puts(seq, "MACSRC_RND "); - - if (pkt_dev->flags & F_MACDST_RND) - seq_puts(seq, "MACDST_RND "); - - if (pkt_dev->flags & F_VID_RND) - seq_puts(seq, "VID_RND "); - - if (pkt_dev->flags & F_SVID_RND) - seq_puts(seq, "SVID_RND "); - - if (pkt_dev->flags & F_NODE) - seq_puts(seq, "NODE_ALLOC "); + } seq_puts(seq, "\n"); -- cgit From 52e12d5daea48c92f0b0354f4cdc127a2c0a3c52 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 18 Jan 2018 18:31:37 +0000 Subject: pktgen: Clean read user supplied flag mess Don't use error-prone-brute-force way. Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller --- net/core/pktgen.c | 144 +++++++++++++++--------------------------------------- 1 file changed, 39 insertions(+), 105 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 17fb036d7297..b8ab5c829511 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -830,6 +830,35 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) return i; } +static __u32 pktgen_read_flag(const char *f, bool *disable) +{ + __u32 i; + + if (f[0] == '!') { + *disable = true; + f++; + } + + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT) + continue; + + /* allow only disabling ipv6 flag */ + if (!*disable && i == IPV6_SHIFT) + continue; + + if (strcmp(f, pkt_flag_names[i]) == 0) + return 1 << i; + } + + if (strcmp(f, "FLOW_RND") == 0) { + *disable = !*disable; + return F_FLOW_SEQ; + } + + return 0; +} + static ssize_t pktgen_if_write(struct file *file, const char __user * user_buffer, size_t count, loff_t * offset) @@ -1187,7 +1216,10 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "flag")) { + __u32 flag; char f[32]; + bool disable = false; + memset(f, 0, 32); len = strn_len(&user_buffer[i], sizeof(f) - 1); if (len < 0) @@ -1196,113 +1228,15 @@ static ssize_t pktgen_if_write(struct file *file, if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; i += len; - if (strcmp(f, "IPSRC_RND") == 0) - pkt_dev->flags |= F_IPSRC_RND; - - else if (strcmp(f, "!IPSRC_RND") == 0) - pkt_dev->flags &= ~F_IPSRC_RND; - - else if (strcmp(f, "TXSIZE_RND") == 0) - pkt_dev->flags |= F_TXSIZE_RND; - - else if (strcmp(f, "!TXSIZE_RND") == 0) - pkt_dev->flags &= ~F_TXSIZE_RND; - - else if (strcmp(f, "IPDST_RND") == 0) - pkt_dev->flags |= F_IPDST_RND; - - else if (strcmp(f, "!IPDST_RND") == 0) - pkt_dev->flags &= ~F_IPDST_RND; - - else if (strcmp(f, "UDPSRC_RND") == 0) - pkt_dev->flags |= F_UDPSRC_RND; - - else if (strcmp(f, "!UDPSRC_RND") == 0) - pkt_dev->flags &= ~F_UDPSRC_RND; - - else if (strcmp(f, "UDPDST_RND") == 0) - pkt_dev->flags |= F_UDPDST_RND; - - else if (strcmp(f, "!UDPDST_RND") == 0) - pkt_dev->flags &= ~F_UDPDST_RND; - - else if (strcmp(f, "MACSRC_RND") == 0) - pkt_dev->flags |= F_MACSRC_RND; - - else if (strcmp(f, "!MACSRC_RND") == 0) - pkt_dev->flags &= ~F_MACSRC_RND; - else if (strcmp(f, "MACDST_RND") == 0) - pkt_dev->flags |= F_MACDST_RND; + flag = pktgen_read_flag(f, &disable); - else if (strcmp(f, "!MACDST_RND") == 0) - pkt_dev->flags &= ~F_MACDST_RND; - - else if (strcmp(f, "MPLS_RND") == 0) - pkt_dev->flags |= F_MPLS_RND; - - else if (strcmp(f, "!MPLS_RND") == 0) - pkt_dev->flags &= ~F_MPLS_RND; - - else if (strcmp(f, "VID_RND") == 0) - pkt_dev->flags |= F_VID_RND; - - else if (strcmp(f, "!VID_RND") == 0) - pkt_dev->flags &= ~F_VID_RND; - - else if (strcmp(f, "SVID_RND") == 0) - pkt_dev->flags |= F_SVID_RND; - - else if (strcmp(f, "!SVID_RND") == 0) - pkt_dev->flags &= ~F_SVID_RND; - - else if (strcmp(f, "FLOW_SEQ") == 0 || strcmp(f, "!FLOW_RND") == 0) - pkt_dev->flags |= F_FLOW_SEQ; - - else if (strcmp(f, "FLOW_RND") == 0 || strcmp(f, "!FLOW_SEQ") == 0) - pkt_dev->flags &= ~F_FLOW_SEQ; - - else if (strcmp(f, "QUEUE_MAP_RND") == 0) - pkt_dev->flags |= F_QUEUE_MAP_RND; - - else if (strcmp(f, "!QUEUE_MAP_RND") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_RND; - - else if (strcmp(f, "QUEUE_MAP_CPU") == 0) - pkt_dev->flags |= F_QUEUE_MAP_CPU; - - else if (strcmp(f, "!QUEUE_MAP_CPU") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_CPU; -#ifdef CONFIG_XFRM - else if (strcmp(f, "IPSEC") == 0) - pkt_dev->flags |= F_IPSEC; - - else if (strcmp(f, "!IPSEC") == 0) - pkt_dev->flags &= ~F_IPSEC; -#endif - - else if (strcmp(f, "!IPV6") == 0) - pkt_dev->flags &= ~F_IPV6; - - else if (strcmp(f, "NODE_ALLOC") == 0) - pkt_dev->flags |= F_NODE; - - else if (strcmp(f, "!NODE_ALLOC") == 0) - pkt_dev->flags &= ~F_NODE; - - else if (strcmp(f, "UDPCSUM") == 0) - pkt_dev->flags |= F_UDPCSUM; - - else if (strcmp(f, "!UDPCSUM") == 0) - pkt_dev->flags &= ~F_UDPCSUM; - - else if (strcmp(f, "NO_TIMESTAMP") == 0) - pkt_dev->flags |= F_NO_TIMESTAMP; - - else if (strcmp(f, "!NO_TIMESTAMP") == 0) - pkt_dev->flags &= ~F_NO_TIMESTAMP; - - else { + if (flag) { + if (disable) + pkt_dev->flags &= ~flag; + else + pkt_dev->flags |= flag; + } else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", f, -- cgit From 36fd633ec98acd2028585c22128fcaa3da6d5770 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 26 Jun 2017 13:19:16 -0400 Subject: net: separate SIOCGIFCONF handling from dev_ioctl() Only two of dev_ioctl() callers may pass SIOCGIFCONF to it. Separating that codepath from the rest of dev_ioctl() allows both to simplify dev_ioctl() itself (all other cases work with struct ifreq *) *and* seriously simplify the compat side of that beast: all it takes is passing to inet_gifconf() an extra argument - the size of individual records (sizeof(struct ifreq) or sizeof(struct compat_ifreq)). With dev_ifconf() called directly from sock_do_ioctl()/compat_dev_ifconf() that's easy to arrange. As the result, compat side of SIOCGIFCONF doesn't need any allocations, copy_in_user() back and forth, etc. Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- net/core/dev_ioctl.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7e690d0ccd05..5cdec23dd28e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -66,9 +66,8 @@ EXPORT_SYMBOL(register_gifconf); * Thus we will need a 'compatibility mode'. */ -static int dev_ifconf(struct net *net, char __user *arg) +int dev_ifconf(struct net *net, struct ifconf *ifc, int size) { - struct ifconf ifc; struct net_device *dev; char __user *pos; int len; @@ -79,11 +78,8 @@ static int dev_ifconf(struct net *net, char __user *arg) * Fetch the caller's info block. */ - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; + pos = ifc->ifc_buf; + len = ifc->ifc_len; /* * Loop over the interfaces, and write an info block for each. @@ -95,10 +91,10 @@ static int dev_ifconf(struct net *net, char __user *arg) if (gifconf_list[i]) { int done; if (!pos) - done = gifconf_list[i](dev, NULL, 0); + done = gifconf_list[i](dev, NULL, 0, size); else done = gifconf_list[i](dev, pos + total, - len - total); + len - total, size); if (done < 0) return -EFAULT; total += done; @@ -109,12 +105,12 @@ static int dev_ifconf(struct net *net, char __user *arg) /* * All done. Write the updated control block back to the caller. */ - ifc.ifc_len = total; + ifc->ifc_len = total; /* * Both BSD and Solaris return 0 here, so we do too. */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + return 0; } /* @@ -412,17 +408,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) int ret; char *colon; - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } if (cmd == SIOCGIFNAME) return dev_ifname(net, (struct ifreq __user *)arg); -- cgit From b1b0c245067268043e0e832432f3d537a5cae33b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 1 Oct 2017 20:13:08 -0400 Subject: lift handling of SIOCIW... out of dev_ioctl() Signed-off-by: Al Viro --- net/core/dev_ioctl.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5cdec23dd28e..d262f159f9fd 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -411,24 +411,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCGIFNAME) return dev_ifname(net, (struct ifreq __user *)arg); - /* - * Take care of Wireless Extensions. Unfortunately struct iwreq - * isn't a proper subset of struct ifreq (it's 8 byte shorter) - * so we need to treat it specially, otherwise applications may - * fault if the struct they're passing happens to land at the - * end of a mapped page. - */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - struct iwreq iwr; - - if (copy_from_user(&iwr, arg, sizeof(iwr))) - return -EFAULT; - - iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; - - return wext_handle_ioctl(net, &iwr, cmd, arg); - } - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; -- cgit From 44c02a2c3dc55835e9f0d8ef73966406cd805001 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 5 Oct 2017 12:59:44 -0400 Subject: dev_ioctl(): move copyin/copyout to callers Signed-off-by: Al Viro --- net/core/dev_ioctl.c | 85 +++++++++++++++------------------------------------- 1 file changed, 24 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index d262f159f9fd..0ab1af04296c 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -18,26 +18,10 @@ * match. --pb */ -static int dev_ifname(struct net *net, struct ifreq __user *arg) +static int dev_ifname(struct net *net, struct ifreq *ifr) { - struct ifreq ifr; - int error; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - ifr.ifr_name[IFNAMSIZ-1] = 0; - - error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); - if (error) - return error; - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; + return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } static gifconf_func_t *gifconf_list[NPROTO]; @@ -402,24 +386,24 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. */ -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) { - struct ifreq ifr; int ret; char *colon; + if (need_copyout) + *need_copyout = true; if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; + return dev_ifname(net, ifr); - ifr.ifr_name[IFNAMSIZ-1] = 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; - colon = strchr(ifr.ifr_name, ':'); + colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; + dev_load(net, ifr->ifr_name); + /* * See which interface the caller is talking about. */ @@ -439,31 +423,19 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); + ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ethtool(net, &ifr); + ret = dev_ethtool(net, ifr); rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; /* @@ -477,17 +449,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFNAME: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; /* @@ -528,10 +494,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* fall through */ case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); + if (need_copyout) + *need_copyout = false; return ret; case SIOCGIFMEM: @@ -551,13 +518,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; return ret; } return -ENOTTY; -- cgit From fb07a820fe3fedabffc57863e0f823c912d81bad Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 19 Jan 2018 19:14:53 +0300 Subject: net: Move net:netns_ids destruction out of rtnl_lock() and document locking scheme Currently, we unhash a dying net from netns_ids lists under rtnl_lock(). It's a leftover from the time when net::netns_ids was introduced. There was no net::nsid_lock, and rtnl_lock() was mostly need to order modification of alive nets nsid idr, i.e. for: for_each_net(tmp) { ... id = __peernet2id(tmp, net); idr_remove(&tmp->netns_ids, id); ... } Since we have net::nsid_lock, the modifications are protected by this local lock, and now we may introduce better scheme of netns_ids destruction. Let's look at the functions peernet2id_alloc() and get_net_ns_by_id(). Previous commits taught these functions to work well with dying net acquired from rtnl unlocked lists. And they are the only functions which can hash a net to netns_ids or obtain from there. And as easy to check, other netns_ids operating functions works with id, not with net pointers. So, we do not need rtnl_lock to synchronize cleanup_net() with all them. The another property, which is used in the patch, is that net is unhashed from net_namespace_list in the only place and by the only process. So, we avoid excess rcu_read_lock() or rtnl_lock(), when we'are iterating over the list in unhash_nsid(). All the above makes possible to keep rtnl_lock() locked only for net->list deletion, and completely avoid it for netns_ids unhashing and destruction. As these two doings may take long time (e.g., memory allocation to send skb), the patch should positively act on the scalability and signify decrease the time, which rtnl_lock() is held in cleanup_net(). Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 62 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1ccb953b3b09..3cad5f51afd3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -439,13 +439,40 @@ struct net *copy_net_ns(unsigned long flags, return net; } +static void unhash_nsid(struct net *net, struct net *last) +{ + struct net *tmp; + /* This function is only called from cleanup_net() work, + * and this work is the only process, that may delete + * a net from net_namespace_list. So, when the below + * is executing, the list may only grow. Thus, we do not + * use for_each_net_rcu() or rtnl_lock(). + */ + for_each_net(tmp) { + int id; + + spin_lock_bh(&tmp->nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + spin_unlock_bh(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); + if (tmp == last) + break; + } + spin_lock_bh(&net->nsid_lock); + idr_destroy(&net->netns_ids); + spin_unlock_bh(&net->nsid_lock); +} + static DEFINE_SPINLOCK(cleanup_list_lock); static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp; + struct net *net, *tmp, *last; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -458,26 +485,25 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) { + list_for_each_entry(net, &net_kill_list, cleanup_list) list_del_rcu(&net->list); - list_add_tail(&net->exit_list, &net_exit_list); - for_each_net(tmp) { - int id; - - spin_lock_bh(&tmp->nsid_lock); - id = __peernet2id(tmp, net); - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); - if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); - } - spin_lock_bh(&net->nsid_lock); - idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + /* Cache last net. After we unlock rtnl, no one new net + * added to net_namespace_list can assign nsid pointer + * to a net from net_kill_list (see peernet2id_alloc()). + * So, we skip them in unhash_nsid(). + * + * Note, that unhash_nsid() does not delete nsid links + * between net_kill_list's nets, as they've already + * deleted from net_namespace_list. But, this would be + * useless anyway, as netns_ids are destroyed there. + */ + last = list_last_entry(&net_namespace_list, struct net, list); + rtnl_unlock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) { + unhash_nsid(net, last); + list_add_tail(&net->exit_list, &net_exit_list); } - rtnl_unlock(); /* * Another CPU might be rcu-iterating the list, wait for it. -- cgit From 2585cd62f0986a6e6d9c83363ed6dbcc66bc9f32 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:05 -0800 Subject: bpf: Only reply field should be writeable Currently, a sock_ops BPF program can write the op field and all the reply fields (reply and replylong). This is a bug. The op field should not have been writeable and there is currently no way to use replylong field for indices >= 1. This patch enforces that only the reply field (which equals replylong[0]) is writeable. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Signed-off-by: Lawrence Brakmo Acked-by: Yuchung Cheng Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 18da42a81d0c..bf9bb755e369 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3845,8 +3845,7 @@ static bool sock_ops_is_valid_access(int off, int size, { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): break; default: return false; -- cgit From a33de3973488680def6a900e74ec67ba2bef226c Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:06 -0800 Subject: bpf: Make SOCK_OPS_GET_TCP size independent Make SOCK_OPS_GET_TCP helper macro size independent (before only worked with 4-byte fields. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index bf9bb755e369..62e78748d95f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4470,9 +4470,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; /* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ +#define SOCK_OPS_GET_TCP(FIELD_NAME) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \ + FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4484,16 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \ + FIELD_NAME), si->dst_reg, \ + si->dst_reg, \ offsetof(struct tcp_sock, FIELD_NAME)); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP32(snd_cwnd); + SOCK_OPS_GET_TCP(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP32(srtt_us); + SOCK_OPS_GET_TCP(srtt_us); break; } return insn - insn_buf; -- cgit From 34d367c59233464dbd1f07445c4665099a7128ec Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:07 -0800 Subject: bpf: Make SOCK_OPS_GET_TCP struct independent Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added 2 arguments so now it can also work with struct sock fields. The first argument is the name of the field in the bpf_sock_ops struct, the 2nd argument is the name of the field in the OBJ struct. Previous: SOCK_OPS_GET_TCP(FIELD_NAME) New: SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) Where OBJ is either "struct tcp_sock" or "struct sock" (without quotation). BPF_FIELD is the name of the field in the bpf_sock_ops struct and OBJ_FIELD is the name of the field in the OBJ struct. Although the field names are currently the same, the kernel struct names could change in the future and this change makes it easier to support that. Note that adding access to tcp_sock fields in sock_ops programs does not preclude the tcp_sock fields from being removed as long as we are willing to do one of the following: 1) Return a fixed value (e.x. 0 or 0xffffffff), or 2) Make the verifier fail if that field is accessed (i.e. program fails to load) so the user will know that field is no longer supported. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 62e78748d95f..dbb6d2f60680 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4469,11 +4469,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; -/* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP(FIELD_NAME) \ +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \ - FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4485,18 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \ - FIELD_NAME), si->dst_reg, \ - si->dst_reg, \ - offsetof(struct tcp_sock, FIELD_NAME)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP(snd_cwnd); + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP(srtt_us); + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; } return insn - insn_buf; -- cgit From b73042b8a28e2603ac178295ab96c876ba5a97a1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:08 -0800 Subject: bpf: Add write access to tcp_sock and sock fields This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to struct tcp_sock or struct sock fields. This required adding a new field "temp" to struct bpf_sock_ops_kern for temporary storage that is used by sock_ops_convert_ctx_access. It is used to store and recover the contents of a register, so the register can be used to store the address of the sk. Since we cannot overwrite the dst_reg because it contains the pointer to ctx, nor the src_reg since it contains the value we want to store, we need an extra register to contain the address of the sk. Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the GET or SET macros depending on the value of the TYPE field. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index dbb6d2f60680..c356ec02b1a5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4491,6 +4491,54 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, offsetof(OBJ, OBJ_FIELD)); \ } while (0) +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; -- cgit From b13d880721729384757f235166068c315326f4a1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:10 -0800 Subject: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary use is to determine if there should be calls to sock_ops bpf program at various points in the TCP code. The field is initialized to zero, disabling the calls. A sock_ops BPF program can set it, per connection and as necessary, when the connection is established. It also adds support for reading and writting the field within a sock_ops BPF program. Reading is done by accessing the field directly. However, writing is done through the helper function bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program is trying to set a callback that is not supported in the current kernel (i.e. running an older kernel). The helper function returns 0 if it was able to set all of the bits set in the argument, a positive number containing the bits that could not be set, or -EINVAL if the socket is not a full TCP socket. Examples of where one could call the bpf program: 1) When RTO fires 2) When a packet is retransmitted 3) When the connection terminates 4) When a packet is sent 5) When a packet is received Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index c356ec02b1a5..6936d19ac736 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3328,6 +3328,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3510,6 +3537,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -4546,6 +4575,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; } return insn - insn_buf; } -- cgit From 44f0e43037d3a17b043843ba67610ac7c7e37db6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:12 -0800 Subject: bpf: Add support for reading sk_state and more Add support for reading many more tcp_sock fields state, same as sk->sk_state rtt_min same as sk->rtt_min.s[0].v (current rtt_min) snd_ssthresh rcv_nxt snd_nxt snd_una mss_cache ecn_flags rate_delivered rate_interval_us packets_out retrans_out total_retrans segs_in data_segs_in segs_out data_segs_out lost_out sacked_out sk_txhash bytes_received (__u64) bytes_acked (__u64) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6936d19ac736..a858ebc4ece4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3855,33 +3855,43 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - - return true; -} -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4498,6 +4508,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ @@ -4580,6 +4616,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; } return insn - insn_buf; } -- cgit From 6f9bd3d731aac0d2ac21dd78a642af5df38fb5c5 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:13 -0800 Subject: bpf: Add sock_ops R/W access to tclass Adds direct write access to sk_txhash and access to tclass for ipv6 flows through getsockopt and setsockopt. Sample usage for tclass: bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v)) where skops is a pointer to the ctx (struct bpf_sock_ops). Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index a858ebc4ece4..fe2c7937351f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3232,6 +3232,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3241,7 +3264,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3307,6 +3331,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3871,6 +3911,7 @@ static bool sock_ops_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; @@ -4690,7 +4731,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, sk_txhash): - SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); break; case offsetof(struct bpf_sock_ops, bytes_received): @@ -4701,6 +4743,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); break; + } return insn - insn_buf; } -- cgit From 1d621674d923790d09cab9e2c7da7da6446a6257 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:36 +0100 Subject: bpf: xor of a/x in cbpf can be done in 32 bit alu Very minor optimization; saves 1 byte per program in x86_64 JIT in cBPF prologue. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index fe2c7937351f..60d8c8712652 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to -- cgit From f6b1b3bf0d5f681631a293cfe1ca934b81716f1e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:39 +0100 Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception One of the ugly leftovers from the early eBPF days is that div/mod operations based on registers have a hard-coded src_reg == 0 test in the interpreter as well as in JIT code generators that would return from the BPF program with exit code 0. This was basically adopted from cBPF interpreter for historical reasons. There are multiple reasons why this is very suboptimal and prone to bugs. To name one: the return code mapping for such abnormal program exit of 0 does not always match with a suitable program type's exit code mapping. For example, '0' in tc means action 'ok' where the packet gets passed further up the stack, which is just undesirable for such cases (e.g. when implementing policy) and also does not match with other program types. While trying to work out an exception handling scheme, I also noticed that programs crafted like the following will currently pass the verifier: 0: (bf) r6 = r1 1: (85) call pc+8 caller: R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 callee: frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1 10: (b4) (u32) r2 = (u32) 0 11: (b4) (u32) r3 = (u32) 1 12: (3c) (u32) r3 /= (u32) r2 13: (61) r0 = *(u32 *)(r1 +76) 14: (95) exit returning from callee: frame1: R0_w=pkt(id=0,off=0,r=0,imm=0) R1=ctx(id=0,off=0,imm=0) R2_w=inv0 R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_1 to caller at 2: R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 2: (bf) r1 = r6 3: (61) r1 = *(u32 *)(r1 +80) 4: (bf) r2 = r0 5: (07) r2 += 8 6: (2d) if r2 > r1 goto pc+1 R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0) R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 7: (71) r0 = *(u8 *)(r0 +0) 8: (b7) r0 = 1 9: (95) exit from 6 to 8: safe processed 16 insns (limit 131072), stack depth 0+0 Basically what happens is that in the subprog we make use of a div/mod by 0 exception and in the 'normal' subprog's exit path we just return skb->data back to the main prog. This has the implication that the verifier thinks we always get a pkt pointer in R0 while we still have the implicit 'return 0' from the div as an alternative unconditional return path earlier. Thus, R0 then contains 0, meaning back in the parent prog we get the address range of [0x0, skb->data_end] as read and writeable. Similar can be crafted with other pointer register types. Since i) BPF_ABS/IND is not allowed in programs that contain BPF to BPF calls (and generally it's also disadvised to use in native eBPF context), ii) unknown opcodes don't return zero anymore, iii) we don't return an exception code in dead branches, the only last missing case affected and to fix is the div/mod handling. What we would really need is some infrastructure to propagate exceptions all the way to the original prog unwinding the current stack and returning that code to the caller of the BPF program. In user space such exception handling for similar runtimes is typically implemented with setjmp(3) and longjmp(3) as one possibility which is not available in the kernel, though (kgdb used to implement it in kernel long time ago). I implemented a PoC exception handling mechanism into the BPF interpreter with porting setjmp()/longjmp() into x86_64 and adding a new internal BPF_ABRT opcode that can use a program specific exception code for all exception cases we have (e.g. div/mod by 0, unknown opcodes, etc). While this seems to work in the constrained BPF environment (meaning, here, we don't need to deal with state e.g. from memory allocations that we would need to undo before going into exception state), it still has various drawbacks: i) we would need to implement the setjmp()/longjmp() for every arch supported in the kernel and for x86_64, arm64, sparc64 JITs currently supporting calls, ii) it has unconditional additional cost on main program entry to store CPU register state in initial setjmp() call, and we would need some way to pass the jmp_buf down into ___bpf_prog_run() for main prog and all subprogs, but also storing on stack is not really nice (other option would be per-cpu storage for this, but it also has the drawback that we need to disable preemption for every BPF program types). All in all this approach would add a lot of complexity. Another poor-man's solution would be to have some sort of additional shared register or scratch buffer to hold state for exceptions, and test that after every call return to chain returns and pass R0 all the way down to BPF prog caller. This is also problematic in various ways: i) an additional register doesn't map well into JITs, and some other scratch space could only be on per-cpu storage, which, again has the side-effect that this only works when we disable preemption, or somewhere in the input context which is not available everywhere either, and ii) this adds significant runtime overhead by putting conditionals after each and every call, as well as implementation complexity. Yet another option is to teach verifier that div/mod can return an integer, which however is also complex to implement as verifier would need to walk such fake 'mov r0,; exit;' sequeuence and there would still be no guarantee for having propagation of this further down to the BPF caller as proper exception code. For parent prog, it is also is not distinguishable from a normal return of a constant scalar value. The approach taken here is a completely different one with little complexity and no additional overhead involved in that we make use of the fact that a div/mod by 0 is undefined behavior. Instead of bailing out, we adapt the same behavior as on some major archs like ARMv8 [0] into eBPF as well: X div 0 results in 0, and X mod 0 results in X. aarch64 and aarch32 ISA do not generate any traps or otherwise aborts of program execution for unsigned divides. I verified this also with a test program compiled by gcc and clang, and the behavior matches with the spec. Going forward we adapt the eBPF verifier to emit such rewrites once div/mod by register was seen. cBPF is not touched and will keep existing 'return 0' semantics. Given the options, it seems the most suitable from all of them, also since major archs have similar schemes in place. Given this is all in the realm of undefined behavior, we still have the option to adapt if deemed necessary and this way we would also have the option of more flexibility from LLVM code generation side (which is then fully visible to verifier). Thus, this patch i) fixes the panic seen in above program and ii) doesn't bypass the verifier observations. [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b] http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf 1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV) "A division by zero results in a zero being written to the destination register, without any indication that the division by zero occurred." 2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV) "For the SDIV and UDIV instructions, division by zero always returns a zero result." Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 60d8c8712652..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; -- cgit From 7c4f63ba824302492985553018881455982241d6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jan 2018 15:26:32 +0100 Subject: rtnetlink: enable IFLA_IF_NETNSID in do_setlink() RTM_{NEW,SET}LINK already allow operations on other network namespaces by identifying the target network namespace through IFLA_NET_NS_{FD,PID} properties. This is done by looking for the corresponding properties in do_setlink(). Extend do_setlink() to also look for the IFLA_IF_NETNSID property. This introduces no functional changes since all callers of do_setlink() currently block IFLA_IF_NETNSID by reporting an error before they reach do_setlink(). This introduces the helpers: static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) to simplify permission checks and target network namespace retrieval for RTM_* requests that already support IFLA_NET_NS_{FD,PID} but get extended to IFLA_IF_NETNSID. To perserve backwards compatibility the helpers look for IFLA_NET_NS_{FD,PID} properties first before checking for IFLA_IF_NETNSID. Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 97874daa1336..f7e99c25dfe4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1902,6 +1902,49 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) } EXPORT_SYMBOL(rtnl_link_get_net); +/* Figure out which network namespace we are talking about by + * examining the link attributes in the following order: + * + * 1. IFLA_NET_NS_PID + * 2. IFLA_NET_NS_FD + * 3. IFLA_IF_NETNSID + */ +static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, + struct nlattr *tb[]) +{ + struct net *net; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) + return rtnl_link_get_net(src_net, tb); + + if (!tb[IFLA_IF_NETNSID]) + return get_net(src_net); + + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID])); + if (!net) + return ERR_PTR(-EINVAL); + + return net; +} + +static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, + struct net *src_net, + struct nlattr *tb[], int cap) +{ + struct net *net; + + net = rtnl_link_get_net_by_nlattr(src_net, tb); + if (IS_ERR(net)) + return net; + + if (!netlink_ns_capable(skb, net->user_ns, cap)) { + put_net(net); + return ERR_PTR(-EPERM); + } + + return net; +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -2164,17 +2207,14 @@ static int do_setlink(const struct sk_buff *skb, const struct net_device_ops *ops = dev->netdev_ops; int err; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { - struct net *net = rtnl_link_get_net(dev_net(dev), tb); + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { + struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { - put_net(net); - err = -EPERM; - goto errout; - } + err = dev_change_net_namespace(dev, net, ifname); put_net(net); if (err) -- cgit From c310bfcb6e1be993629c5747accf8e1c65fbb255 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jan 2018 15:26:33 +0100 Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_SETLINK - Backwards Compatibility: If userspace wants to determine whether RTM_SETLINK supports the IFLA_IF_NETNSID property they should first send an RTM_GETLINK request with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply does not include IFLA_IF_NETNSID userspace should assume that IFLA_IF_NETNSID is not supported on this kernel. If the reply does contain an IFLA_IF_NETNSID property userspace can send an RTM_SETLINK with a IFLA_IF_NETNSID property. If they receive EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property with RTM_SETLINK. Userpace should then fallback to other means. To retain backwards compatibility the kernel will first check whether a IFLA_NET_NS_PID or IFLA_NET_NS_FD property has been passed. If either one is found it will be used to identify the target network namespace. This implies that users who do not care whether their running kernel supports IFLA_IF_NETNSID with RTM_SETLINK can pass both IFLA_NET_NS_{FD,PID} and IFLA_IF_NETNSID referring to the same network namespace. - Security: Callers must have CAP_NET_ADMIN in the owning user namespace of the target network namespace. Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f7e99c25dfe4..d0c02943c05a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2555,9 +2555,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - if (tb[IFLA_IF_NETNSID]) - return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else -- cgit From b61ad68a9fe85d29d5363eb36860164a049723cf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jan 2018 15:26:34 +0100 Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_DELLINK - Backwards Compatibility: If userspace wants to determine whether RTM_DELLINK supports the IFLA_IF_NETNSID property they should first send an RTM_GETLINK request with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply does not include IFLA_IF_NETNSID userspace should assume that IFLA_IF_NETNSID is not supported on this kernel. If the reply does contain an IFLA_IF_NETNSID property userspace can send an RTM_DELLINK with a IFLA_IF_NETNSID property. If they receive EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property with RTM_DELLINK. Userpace should then fallback to other means. - Security: Callers must have CAP_NET_ADMIN in the owning user namespace of the target network namespace. Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d0c02943c05a..f111557958bb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2639,36 +2639,53 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct net_device *dev; + struct net *tgt_net = net; + struct net_device *dev = NULL; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; + int netnsid = -1; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; - if (tb[IFLA_IF_NETNSID]) - return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + + err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_name(tgt_net, ifname); else if (tb[IFLA_GROUP]) - return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP])); + err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - return -EINVAL; + goto out; - if (!dev) - return -ENODEV; + if (!dev) { + if (tb[IFLA_IFNAME] || ifm->ifi_index > 0) + err = -ENODEV; - return rtnl_delete_link(dev); + goto out; + } + + err = rtnl_delete_link(dev); + +out: + if (netnsid >= 0) + put_net(tgt_net); + + return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) -- cgit From c36ac8e2307334c83e8bf81ed361f0e4959d995f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Jan 2018 15:01:38 +0100 Subject: dev: always advertise the new nsid when the netns iface changes The user should be able to follow any interface that moves to another netns. There is no reason to hide physical interfaces. CC: Jiri Benc CC: Christian Brauner Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- net/core/dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4670ccabe23a..59987eb6511a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8529,10 +8529,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) - new_nsid = peernet2id_alloc(dev_net(dev), net); - else - new_nsid = peernet2id(dev_net(dev), net); + new_nsid = peernet2id_alloc(dev_net(dev), net); rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* -- cgit From 38e01b30563a5b5ade7b54e5d739d16a2b02fe82 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Jan 2018 15:01:39 +0100 Subject: dev: advertise the new ifindex when the netns iface changes The goal is to let the user follow an interface that moves to another netns. CC: Jiri Benc CC: Christian Brauner Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- net/core/dev.c | 19 ++++++++++++------- net/core/rtnetlink.c | 31 ++++++++++++++++++++----------- 2 files changed, 32 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 59987eb6511a..858501b12869 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7360,7 +7360,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); /* * Flush the unicast and multicast chains @@ -8473,7 +8473,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err, new_nsid; + int err, new_nsid, new_ifindex; ASSERT_RTNL(); @@ -8529,8 +8529,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + new_nsid = peernet2id_alloc(dev_net(dev), net); - rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); /* * Flush the unicast and multicast chains @@ -8544,10 +8552,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - dev->ifindex = dev_new_index(net); + dev->ifindex = new_ifindex; /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f111557958bb..e04af7b7f448 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -988,6 +988,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ @@ -1511,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event, int *new_nsid, int tgt_netnsid) + u32 event, int *new_nsid, int new_ifindex, + int tgt_netnsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1608,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; + if (new_ifindex && + nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) + goto nla_put_failure; + rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) @@ -1853,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, + ext_filter_mask, 0, NULL, 0, netnsid); if (err < 0) { @@ -3088,7 +3094,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, - 0, NULL, netnsid); + 0, NULL, 0, netnsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -3184,7 +3190,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags, int *new_nsid) + u32 event, gfp_t flags, int *new_nsid, + int new_ifindex) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3197,7 +3204,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, - new_nsid, -1); + new_nsid, new_ifindex, -1); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3220,14 +3227,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, + new_ifindex); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -3235,14 +3243,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + NULL, 0); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid); + new_nsid, new_ifindex); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4642,7 +4651,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); break; default: break; -- cgit From 6a643ddb5624be7e0694d49f5765a8d41c1ab6d0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 25 Jan 2018 18:26:22 -0800 Subject: net: introduce helper dev_change_tx_queue_len() This patch promotes the local change_tx_queue_len() to a core helper function, dev_change_tx_queue_len(), so that rtnetlink and net-sysfs could share the code. This also prepares for the following patch. Note, the -EFAULT in the original code doesn't make sense, we should propagate the errno from notifiers. Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++++++++++ net/core/net-sysfs.c | 25 +------------------------ net/core/rtnetlink.c | 18 +++++------------- 3 files changed, 34 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 858501b12869..520c24671bc5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7047,6 +7047,34 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) } EXPORT_SYMBOL(dev_set_mtu); +/** + * dev_change_tx_queue_len - Change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; + + if (new_len != orig_len) { + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) { + netdev_err(dev, + "refused to change device tx_queue_len\n"); + dev->tx_queue_len = orig_len; + return res; + } + } + + return 0; +} + /** * dev_set_group - Change group this device belongs to * @dev: device diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c4a28f4667b6..60a5ad2c33ee 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -346,29 +346,6 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) -{ - unsigned int orig_len = dev->tx_queue_len; - int res; - - if (new_len != (unsigned int)new_len) - return -ERANGE; - - if (new_len != orig_len) { - dev->tx_queue_len = new_len; - res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); - res = notifier_to_errno(res); - if (res) { - netdev_err(dev, - "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; - return -EFAULT; - } - } - - return 0; -} - static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -376,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_tx_queue_len); + return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); } NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e04af7b7f448..061e27cb6c12 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2337,19 +2337,11 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); - unsigned int orig_len = dev->tx_queue_len; - - if (dev->tx_queue_len ^ value) { - dev->tx_queue_len = value; - err = call_netdevice_notifiers( - NETDEV_CHANGE_TX_QUEUE_LEN, dev); - err = notifier_to_errno(err); - if (err) { - dev->tx_queue_len = orig_len; - goto errout; - } - status |= DO_SETLINK_MODIFIED; - } + + err = dev_change_tx_queue_len(dev, value); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { -- cgit From 48bfd55e7e4149a304e89c1999436cf52d094a27 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 25 Jan 2018 18:26:23 -0800 Subject: net_sched: plug in qdisc ops change_tx_queue_len Introduce a new qdisc ops ->change_tx_queue_len() so that each qdisc could decide how to implement this if it wants. Previously we simply read dev->tx_queue_len, after pfifo_fast switches to skb array, we need this API to resize the skb array when we change dev->tx_queue_len. To avoid handling race conditions with TX BH, we need to deactivate all TX queues before change the value and bring them back after we are done, this also makes implementation easier. Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 520c24671bc5..dda9d7b9a840 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7070,6 +7070,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) dev->tx_queue_len = orig_len; return res; } + return dev_qdisc_change_tx_queue_len(dev); } return 0; -- cgit From 40ca54e3a686f13117f3de0c443f8026dadf7c44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Jan 2018 10:58:43 -0800 Subject: net_sched: gen_estimator: fix lockdep splat syzbot reported a lockdep splat in gen_new_estimator() / est_fetch_counters() when attempting to lock est->stats_lock. Since est_fetch_counters() is called from BH context from timer interrupt, we need to block BH as well when calling it from process context. Most qdiscs use per cpu counters and are immune to the problem, but net/sched/act_api.c and net/netfilter/xt_RATEEST.c are using a spinlock to protect their data. They both call gen_new_estimator() while object is created and not yet alive, so this bug could not trigger a deadlock, only a lockdep splat. Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9834cfa21b21..0a3f88f08727 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -159,7 +159,11 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; + if (stats_lock) + local_bh_disable(); est_fetch_counters(est, &b); + if (stats_lock) + local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; old = rcu_dereference_protected(*rate_est, 1); -- cgit From 5bb8ed075428b71492734af66230aa0c07fcc515 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Jan 2018 18:07:20 +0100 Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK - Backwards Compatibility: If userspace wants to determine whether RTM_NEWLINK supports the IFLA_IF_NETNSID property they should first send an RTM_GETLINK request with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply does not include IFLA_IF_NETNSID userspace should assume that IFLA_IF_NETNSID is not supported on this kernel. If the reply does contain an IFLA_IF_NETNSID property userspace can send an RTM_NEWLINK with a IFLA_IF_NETNSID property. If they receive EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property with RTM_NEWLINK. Userpace should then fallback to other means. - Security: Callers must have CAP_NET_ADMIN in the owning user namespace of the target network namespace. Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 061e27cb6c12..204297dffd2a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2952,14 +2952,10 @@ replay: name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net(net, tb); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - err = -EPERM; - if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN)) - goto out; - if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); -- cgit