From b92cf4aab8e688b1bd501ac2ac4f1b5c99601e3b Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:22 -0500
Subject: net: Create and use new helper xfrm_dst_child().

Only IPSEC routes have a non-NULL dst->child pointer.  And IPSEC
routes are identified by a non-NULL dst->xfrm pointer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index 662a2d4a3d19..6a3c21b8fc8d 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -116,12 +116,14 @@ EXPORT_SYMBOL(dst_alloc);
 
 struct dst_entry *dst_destroy(struct dst_entry * dst)
 {
-	struct dst_entry *child;
+	struct dst_entry *child = NULL;
 
 	smp_rmb();
 
-	child = dst->child;
-
+#ifdef CONFIG_XFRM
+	if (dst->xfrm)
+		child = dst->child;
+#endif
 	if (!(dst->flags & DST_NOCOUNT))
 		dst_entries_add(dst->ops, -1);
 
-- 
cgit 


From b6ca8bd5a9198c70c48297390723e4e56bd6e879 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:45:44 -0500
Subject: xfrm: Move child route linkage into xfrm_dst.

XFRM bundle child chains look like this:

	xdst1 --> xdst2 --> xdst3 --> path_dst

All of xdstN are xfrm_dst objects and xdst->u.dst.xfrm is non-NULL.
The final child pointer in the chain, here called 'path_dst', is some
other kind of route such as an ipv4 or ipv6 one.

The xfrm output path pops routes, one at a time, via the child
pointer, until we hit one which has a dst->xfrm pointer which
is NULL.

We can easily preserve the above mechanisms with child sitting
only in the xfrm_dst structure.  All children in the chain
before we break out of the xfrm_output() loop have dst->xfrm
non-NULL and are therefore xfrm_dst objects.

Since we break out of the loop when we find dst->xfrm NULL, we
will not try to dereference 'dst' as if it were an xfrm_dst.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c    |  9 ++++++---
 net/core/pktgen.c | 12 ++++++------
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index 6a3c21b8fc8d..5cf96179e8e0 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/prefetch.h>
 #include <net/lwtunnel.h>
+#include <net/xfrm.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -62,7 +63,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	      struct net_device *dev, int initial_ref, int initial_obsolete,
 	      unsigned short flags)
 {
-	dst->child = NULL;
 	dst->dev = dev;
 	if (dev)
 		dev_hold(dev);
@@ -121,8 +121,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 	smp_rmb();
 
 #ifdef CONFIG_XFRM
-	if (dst->xfrm)
-		child = dst->child;
+	if (dst->xfrm) {
+		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+
+		child = xdst->child;
+	}
 #endif
 	if (!(dst->flags & DST_NOCOUNT))
 		dst_entries_add(dst->ops, -1);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f95a15086225..b9ce241cd28c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -399,7 +399,7 @@ struct pktgen_dev {
 	__u8	ipsmode;		/* IPSEC mode (config) */
 	__u8	ipsproto;		/* IPSEC type (config) */
 	__u32	spi;
-	struct dst_entry dst;
+	struct xfrm_dst xdst;
 	struct dst_ops dstops;
 #endif
 	char result[512];
@@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
 	 * supports both transport/tunnel mode + ESP/AH type.
 	 */
 	if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
-		skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+		skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
 
 	rcu_read_lock_bh();
 	err = x->outer_mode->output(x, skb);
@@ -3742,10 +3742,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	 * performance under such circumstance.
 	 */
 	pkt_dev->dstops.family = AF_INET;
-	pkt_dev->dst.dev = pkt_dev->odev;
-	dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
-	pkt_dev->dst.child = &pkt_dev->dst;
-	pkt_dev->dst.ops = &pkt_dev->dstops;
+	pkt_dev->xdst.u.dst.dev = pkt_dev->odev;
+	dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false);
+	pkt_dev->xdst.child = &pkt_dev->xdst.u.dst;
+	pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops;
 #endif
 
 	return add_dev_to_thread(t, pkt_dev);
-- 
cgit 


From 3a2232e92e87166a8a5113e918b8c7b7bdce4d83 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:40 -0500
Subject: ipv6: Move dst->from into struct rt6_info.

The dst->from value is only used by ipv6 routes to track where
a route "came from".

Any time we clone or copy a core ipv6 route in the ipv6 routing
tables, we have the copy/clone's ->from point to the base route.

This is used to handle route expiration properly.

Only ipv6 uses this mechanism, and only ipv6 code references
it.  So it is safe to move it into rt6_info.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 net/core/dst.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index 5cf96179e8e0..cf2076c0eb22 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -70,7 +70,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst_init_metrics(dst, dst_default_metrics.metrics, true);
 	dst->expires = 0UL;
 	dst->path = dst;
-	dst->from = NULL;
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
-- 
cgit 


From 0f6c480f23f49b53644b383c5554e579498347f3 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:40:46 -0500
Subject: xfrm: Move dst->path into struct xfrm_dst

The first member of an IPSEC route bundle chain sets it's dst->path to
the underlying ipv4/ipv6 route that carries the bundle.

Stated another way, if one were to follow the xfrm_dst->child chain of
the bundle, the final non-NULL pointer would be the path and point to
either an ipv4 or an ipv6 route.

This is largely used to make sure that PMTU events propagate down to
the correct ipv4 or ipv6 route.

When we don't have the top of an IPSEC bundle 'dst->path == dst'.

Move it down into xfrm_dst and key off of dst->xfrm.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 net/core/dst.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index cf2076c0eb22..9bc3bb6e94ef 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -69,7 +69,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->ops = ops;
 	dst_init_metrics(dst, dst_default_metrics.metrics, true);
 	dst->expires = 0UL;
-	dst->path = dst;
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
-- 
cgit 


From 7149f813d12d92ba9abcf49026f7cebc3d55c426 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 28 Nov 2017 15:41:07 -0500
Subject: net: Remove dst->next

There are no more users.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 net/core/dst.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index 9bc3bb6e94ef..007aa0b08291 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -86,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
-	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
 }
-- 
cgit 


From e94a62f507f9498e5f4b2c69ef181b3402934c2a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 30 Nov 2017 15:39:34 +0100
Subject: net/reuseport: drop legacy code

Since commit e32ea7e74727 ("soreuseport: fast reuseport UDP socket
selection") and commit c125e80b8868 ("soreuseport: fast reuseport
TCP socket selection") the relevant reuseport socket matching the current
packet is selected by the reuseport_select_sock() call. The only
exceptions are invalid BPF filters/filters returning out-of-range
indices.
In the latter case the code implicitly falls back to using the hash
demultiplexing, but instead of selecting the socket inside the
reuseport_select_sock() function, it relies on the hash selection
logic introduced with the early soreuseport implementation.

With this patch, in case of a BPF filter returning a bad socket
index value, we fall back to hash-based selection inside the
reuseport_select_sock() body, so that we can drop some duplicate
code in the ipv4 and ipv6 stack.

This also allows faster lookup in the above scenario and will allow
us to avoid computing the hash value for successful, BPF based
demultiplexing - in a later patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_reuseport.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5eeb1d20cc38..c5bb52bc73a1 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk,
 
 		if (prog && skb)
 			sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
-		else
+
+		/* no bpf or invalid bpf result: fall back to hash usage */
+		if (!sk2)
 			sk2 = reuse->socks[reciprocal_scale(hash, socks)];
 	}
 
-- 
cgit 


From 118b4aa25d90d0930611b71dd28a749c67309ccb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 1 Dec 2017 15:08:55 -0800
Subject: net: xdp: avoid output parameters when querying XDP prog

The output parameters will get unwieldy if we want to add more
information about the program.  Simply pass the entire
struct netdev_bpf in.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c       | 24 ++++++++++++++----------
 net/core/rtnetlink.c |  6 +++++-
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 07ed21d64f92..3f271c9cb5e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7073,17 +7073,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id)
+void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+		     struct netdev_bpf *xdp)
 {
-	struct netdev_bpf xdp;
-
-	memset(&xdp, 0, sizeof(xdp));
-	xdp.command = XDP_QUERY_PROG;
+	memset(xdp, 0, sizeof(*xdp));
+	xdp->command = XDP_QUERY_PROG;
 
 	/* Query must always succeed. */
-	WARN_ON(bpf_op(dev, &xdp) < 0);
-	if (prog_id)
-		*prog_id = xdp.prog_id;
+	WARN_ON(bpf_op(dev, xdp) < 0);
+}
+
+static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+{
+	struct netdev_bpf xdp;
+
+	__dev_xdp_query(dev, bpf_op, &xdp);
 
 	return xdp.prog_attached;
 }
@@ -7134,10 +7138,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		bpf_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL))
+		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
 			return -EEXIST;
 		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-		    __dev_xdp_attached(dev, bpf_op, NULL))
+		    __dev_xdp_attached(dev, bpf_op))
 			return -EBUSY;
 
 		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..9c4cb584bfb0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1261,6 +1261,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	const struct bpf_prog *generic_xdp_prog;
+	struct netdev_bpf xdp;
 
 	ASSERT_RTNL();
 
@@ -1273,7 +1274,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 	if (!ops->ndo_bpf)
 		return XDP_ATTACHED_NONE;
 
-	return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id);
+	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
+	*prog_id = xdp.prog_id;
+
+	return xdp.prog_attached;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
-- 
cgit 


From bd0b2e7fe611953470ec7c533b455fb2abd382cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 1 Dec 2017 15:08:57 -0800
Subject: net: xdp: make the stack take care of the tear down

Since day one of XDP drivers had to remember to free the program
on the remove path.  This leads to code duplication and is error
prone.  Make the stack query the installed programs on unregister
and if something is installed, remove the program.  Freeing of
program attached to XDP generic is moved from free_netdev() as well.

Because the remove will now be called before notifiers are
invoked, BPF offload state of the program will not get destroyed
before uninstall.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 3f271c9cb5e0..6bea8931bb62 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7110,6 +7110,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 	return bpf_op(dev, &xdp);
 }
 
+static void dev_xdp_uninstall(struct net_device *dev)
+{
+	struct netdev_bpf xdp;
+	bpf_op_t ndo_bpf;
+
+	/* Remove generic XDP */
+	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+
+	/* Remove from the driver */
+	ndo_bpf = dev->netdev_ops->ndo_bpf;
+	if (!ndo_bpf)
+		return;
+
+	__dev_xdp_query(dev, ndo_bpf, &xdp);
+	if (xdp.prog_attached == XDP_ATTACHED_NONE)
+		return;
+
+	/* Program removal should always succeed */
+	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -7240,6 +7261,7 @@ static void rollback_registered_many(struct list_head *head)
 		/* Shutdown queueing discipline. */
 		dev_shutdown(dev);
 
+		dev_xdp_uninstall(dev);
 
 		/* Notify protocols, that we are about to destroy
 		 * this device. They should clean all the things.
@@ -8199,7 +8221,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
 void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
-	struct bpf_prog *prog;
 
 	might_sleep();
 	netif_free_tx_queues(dev);
@@ -8218,12 +8239,6 @@ void free_netdev(struct net_device *dev)
 	free_percpu(dev->pcpu_refcnt);
 	dev->pcpu_refcnt = NULL;
 
-	prog = rcu_dereference_protected(dev->xdp_prog, 1);
-	if (prog) {
-		bpf_prog_put(prog);
-		static_key_slow_dec(&generic_xdp_needed);
-	}
-
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		netdev_freemem(dev);
-- 
cgit 


From b8da518c6e87504a6790bfb1539134c41b61a45a Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 1 Dec 2017 15:26:09 -0800
Subject: bpf: allow disabling tunnel csum for ipv6

Before the patch, BPF_F_ZERO_CSUM_TX can be used only for ipv4 tunnel.
With introduction of ip6gretap collect_md mode, the flag should be also
supported for ipv6.

Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 6a85e67fafce..8ec5a504eb28 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3026,10 +3026,11 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
-		if (flags & BPF_F_ZERO_CSUM_TX)
-			info->key.tun_flags &= ~TUNNEL_CSUM;
 	}
 
+	if (flags & BPF_F_ZERO_CSUM_TX)
+		info->key.tun_flags &= ~TUNNEL_CSUM;
+
 	return 0;
 }
 
-- 
cgit 


From addf9b90de22f7aaad0db39bccb5d51ac47dd4e1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:05 +0100
Subject: net: rtnetlink: use rcu to free rtnl message handlers

rtnetlink is littered with READ_ONCE() because we can have read accesses
while another cpu can write to the structure we're reading by
(un)registering doit or dumpit handlers.

This patch changes this so that (un)registering cpu allocates a new
structure and then publishes it via rcu_assign_pointer, i.e. once
another cpu can see such pointer no modifications will occur anymore.

based on initial patch from Peter Zijlstra.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 154 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 101 insertions(+), 53 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..ff292d3f2c41 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -63,6 +63,7 @@ struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
 	unsigned int		flags;
+	struct rcu_head		rcu;
 };
 
 static DEFINE_MUTEX(rtnl_mutex);
@@ -127,7 +128,7 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
-static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
@@ -144,6 +145,20 @@ static inline int rtm_msgindex(int msgtype)
 	return msgindex;
 }
 
+static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
+{
+	struct rtnl_link **tab;
+
+	if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
+		protocol = PF_UNSPEC;
+
+	tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
+	if (!tab)
+		tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
+
+	return tab[msgtype];
+}
+
 /**
  * __rtnl_register - Register a rtnetlink message type
  * @protocol: Protocol family or PF_UNSPEC
@@ -166,28 +181,52 @@ int __rtnl_register(int protocol, int msgtype,
 		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 		    unsigned int flags)
 {
-	struct rtnl_link *tab;
+	struct rtnl_link **tab, *link, *old;
 	int msgindex;
+	int ret = -ENOBUFS;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 	msgindex = rtm_msgindex(msgtype);
 
-	tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]);
+	rtnl_lock();
+	tab = rtnl_msg_handlers[protocol];
 	if (tab == NULL) {
-		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
-		if (tab == NULL)
-			return -ENOBUFS;
+		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
+		if (!tab)
+			goto unlock;
 
+		/* ensures we see the 0 stores */
 		rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
 	}
 
+	old = rtnl_dereference(tab[msgindex]);
+	if (old) {
+		link = kmemdup(old, sizeof(*old), GFP_KERNEL);
+		if (!link)
+			goto unlock;
+	} else {
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
+		if (!link)
+			goto unlock;
+	}
+
+	WARN_ON(doit && link->doit && link->doit != doit);
 	if (doit)
-		tab[msgindex].doit = doit;
+		link->doit = doit;
+	WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
 	if (dumpit)
-		tab[msgindex].dumpit = dumpit;
-	tab[msgindex].flags |= flags;
+		link->dumpit = dumpit;
 
-	return 0;
+	link->flags |= flags;
+
+	/* publish protocol:msgtype */
+	rcu_assign_pointer(tab[msgindex], link);
+	ret = 0;
+	if (old)
+		kfree_rcu(old, rcu);
+unlock:
+	rtnl_unlock();
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__rtnl_register);
 
@@ -220,24 +259,25 @@ EXPORT_SYMBOL_GPL(rtnl_register);
  */
 int rtnl_unregister(int protocol, int msgtype)
 {
-	struct rtnl_link *handlers;
+	struct rtnl_link **tab, *link;
 	int msgindex;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 	msgindex = rtm_msgindex(msgtype);
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
-	if (!handlers) {
+	tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+	if (!tab) {
 		rtnl_unlock();
 		return -ENOENT;
 	}
 
-	handlers[msgindex].doit = NULL;
-	handlers[msgindex].dumpit = NULL;
-	handlers[msgindex].flags = 0;
+	link = tab[msgindex];
+	rcu_assign_pointer(tab[msgindex], NULL);
 	rtnl_unlock();
 
+	kfree_rcu(link, rcu);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister);
@@ -251,20 +291,29 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
  */
 void rtnl_unregister_all(int protocol)
 {
-	struct rtnl_link *handlers;
+	struct rtnl_link **tab, *link;
+	int msgindex;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
+	tab = rtnl_msg_handlers[protocol];
 	RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
+	for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
+		link = tab[msgindex];
+		if (!link)
+			continue;
+
+		rcu_assign_pointer(tab[msgindex], NULL);
+		kfree_rcu(link, rcu);
+	}
 	rtnl_unlock();
 
 	synchronize_net();
 
 	while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
 		schedule();
-	kfree(handlers);
+	kfree(tab);
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
 
@@ -2973,18 +3022,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 		s_idx = 1;
 
 	for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+		struct rtnl_link **tab;
 		int type = cb->nlh->nlmsg_type-RTM_BASE;
-		struct rtnl_link *handlers;
+		struct rtnl_link *link;
 		rtnl_dumpit_func dumpit;
 
 		if (idx < s_idx || idx == PF_PACKET)
 			continue;
 
-		handlers = rtnl_dereference(rtnl_msg_handlers[idx]);
-		if (!handlers)
+		if (type < 0 || type >= RTM_NR_MSGTYPES)
 			continue;
 
-		dumpit = READ_ONCE(handlers[type].dumpit);
+		tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
+		if (!tab)
+			continue;
+
+		link = tab[type];
+		if (!link)
+			continue;
+
+		dumpit = link->dumpit;
 		if (!dumpit)
 			continue;
 
@@ -4314,7 +4371,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct rtnl_link *handlers;
+	struct rtnl_link *link;
 	int err = -EOPNOTSUPP;
 	rtnl_doit_func doit;
 	unsigned int flags;
@@ -4338,32 +4395,20 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	if (family >= ARRAY_SIZE(rtnl_msg_handlers))
-		family = PF_UNSPEC;
-
 	rcu_read_lock();
-	handlers = rcu_dereference(rtnl_msg_handlers[family]);
-	if (!handlers) {
-		family = PF_UNSPEC;
-		handlers = rcu_dereference(rtnl_msg_handlers[family]);
-	}
-
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
 		struct sock *rtnl;
 		rtnl_dumpit_func dumpit;
 		u16 min_dump_alloc = 0;
 
-		dumpit = READ_ONCE(handlers[type].dumpit);
-		if (!dumpit) {
+		link = rtnl_get_link(family, type);
+		if (!link || !link->dumpit) {
 			family = PF_UNSPEC;
-			handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]);
-			if (!handlers)
-				goto err_unlock;
-
-			dumpit = READ_ONCE(handlers[type].dumpit);
-			if (!dumpit)
+			link = rtnl_get_link(family, type);
+			if (!link || !link->dumpit)
 				goto err_unlock;
 		}
+		dumpit = link->dumpit;
 
 		refcount_inc(&rtnl_msg_handlers_ref[family]);
 
@@ -4384,33 +4429,36 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return err;
 	}
 
-	doit = READ_ONCE(handlers[type].doit);
-	if (!doit) {
+	link = rtnl_get_link(family, type);
+	if (!link || !link->doit) {
 		family = PF_UNSPEC;
-		handlers = rcu_dereference(rtnl_msg_handlers[family]);
+		link = rtnl_get_link(PF_UNSPEC, type);
+		if (!link || !link->doit)
+			goto out_unlock;
 	}
 
-	flags = READ_ONCE(handlers[type].flags);
+	flags = link->flags;
 	if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
 		refcount_inc(&rtnl_msg_handlers_ref[family]);
-		doit = READ_ONCE(handlers[type].doit);
+		doit = link->doit;
 		rcu_read_unlock();
 		if (doit)
 			err = doit(skb, nlh, extack);
 		refcount_dec(&rtnl_msg_handlers_ref[family]);
 		return err;
 	}
-
 	rcu_read_unlock();
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[family]);
-	if (handlers) {
-		doit = READ_ONCE(handlers[type].doit);
-		if (doit)
-			err = doit(skb, nlh, extack);
-	}
+	link = rtnl_get_link(family, type);
+	if (link && link->doit)
+		err = link->doit(skb, nlh, extack);
 	rtnl_unlock();
+
+	return err;
+
+out_unlock:
+	rcu_read_unlock();
 	return err;
 
 err_unlock:
-- 
cgit 


From e4202511480da5f8e6870d8f6ecbb821aeaa8caf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:06 +0100
Subject: rtnetlink: get reference on module before invoking handlers

Add yet another rtnl_register function.  It will be used by modules
that can be removed.

The passed module struct is used to prevent module unload while
a netlink dump is in progress or when a DOIT_UNLOCKED doit callback
is called.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 113 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 78 insertions(+), 35 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ff292d3f2c41..de6390365c90 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -62,6 +62,7 @@
 struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
+	struct module		*owner;
 	unsigned int		flags;
 	struct rcu_head		rcu;
 };
@@ -129,7 +130,6 @@ EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
 static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
-static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
 {
@@ -159,27 +159,10 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
 	return tab[msgtype];
 }
 
-/**
- * __rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
-		    unsigned int flags)
+static int rtnl_register_internal(struct module *owner,
+				  int protocol, int msgtype,
+				  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+				  unsigned int flags)
 {
 	struct rtnl_link **tab, *link, *old;
 	int msgindex;
@@ -210,6 +193,9 @@ int __rtnl_register(int protocol, int msgtype,
 			goto unlock;
 	}
 
+	WARN_ON(link->owner && link->owner != owner);
+	link->owner = owner;
+
 	WARN_ON(doit && link->doit && link->doit != doit);
 	if (doit)
 		link->doit = doit;
@@ -228,6 +214,54 @@ unlock:
 	rtnl_unlock();
 	return ret;
 }
+
+/**
+ * rtnl_register_module - Register a rtnetlink message type
+ *
+ * @owner: module registering the hook (THIS_MODULE)
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ *
+ * Like rtnl_register, but for use by removable modules.
+ */
+int rtnl_register_module(struct module *owner,
+			 int protocol, int msgtype,
+			 rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+			 unsigned int flags)
+{
+	return rtnl_register_internal(owner, protocol, msgtype,
+				      doit, dumpit, flags);
+}
+EXPORT_SYMBOL_GPL(rtnl_register_module);
+
+/**
+ * __rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_register(int protocol, int msgtype,
+		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+		    unsigned int flags)
+{
+	return rtnl_register_internal(NULL, protocol, msgtype,
+				      doit, dumpit, flags);
+}
 EXPORT_SYMBOL_GPL(__rtnl_register);
 
 /**
@@ -311,8 +345,6 @@ void rtnl_unregister_all(int protocol)
 
 	synchronize_net();
 
-	while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
-		schedule();
 	kfree(tab);
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
@@ -4372,6 +4404,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct rtnl_link *link;
+	struct module *owner;
 	int err = -EOPNOTSUPP;
 	rtnl_doit_func doit;
 	unsigned int flags;
@@ -4408,24 +4441,32 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (!link || !link->dumpit)
 				goto err_unlock;
 		}
+		owner = link->owner;
 		dumpit = link->dumpit;
 
-		refcount_inc(&rtnl_msg_handlers_ref[family]);
-
 		if (type == RTM_GETLINK - RTM_BASE)
 			min_dump_alloc = rtnl_calcit(skb, nlh);
 
+		err = 0;
+		/* need to do this before rcu_read_unlock() */
+		if (!try_module_get(owner))
+			err = -EPROTONOSUPPORT;
+
 		rcu_read_unlock();
 
 		rtnl = net->rtnl;
-		{
+		if (err == 0) {
 			struct netlink_dump_control c = {
 				.dump		= dumpit,
 				.min_dump_alloc	= min_dump_alloc,
+				.module		= owner,
 			};
 			err = netlink_dump_start(rtnl, skb, nlh, &c);
+			/* netlink_dump_start() will keep a reference on
+			 * module if dump is still in progress.
+			 */
+			module_put(owner);
 		}
-		refcount_dec(&rtnl_msg_handlers_ref[family]);
 		return err;
 	}
 
@@ -4437,14 +4478,19 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			goto out_unlock;
 	}
 
+	owner = link->owner;
+	if (!try_module_get(owner)) {
+		err = -EPROTONOSUPPORT;
+		goto out_unlock;
+	}
+
 	flags = link->flags;
 	if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
-		refcount_inc(&rtnl_msg_handlers_ref[family]);
 		doit = link->doit;
 		rcu_read_unlock();
 		if (doit)
 			err = doit(skb, nlh, extack);
-		refcount_dec(&rtnl_msg_handlers_ref[family]);
+		module_put(owner);
 		return err;
 	}
 	rcu_read_unlock();
@@ -4455,6 +4501,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		err = link->doit(skb, nlh, extack);
 	rtnl_unlock();
 
+	module_put(owner);
+
 	return err;
 
 out_unlock:
@@ -4546,11 +4594,6 @@ static struct pernet_operations rtnetlink_net_ops = {
 
 void __init rtnetlink_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++)
-		refcount_set(&rtnl_msg_handlers_ref[i], 1);
-
 	if (register_pernet_subsys(&rtnetlink_net_ops))
 		panic("rtnetlink_init: cannot initialize rtnetlink\n");
 
-- 
cgit 


From 16feebcf2350aa369001a529f50ce33f2472c01c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Dec 2017 21:44:08 +0100
Subject: rtnetlink: remove __rtnl_register

This removes __rtnl_register and switches callers to either
rtnl_register or rtnl_register_module.

Also, rtnl_register() will now print an error if memory allocation
failed rather than panic the kernel.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index de6390365c90..fb2d61df1e2f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -238,7 +238,7 @@ int rtnl_register_module(struct module *owner,
 EXPORT_SYMBOL_GPL(rtnl_register_module);
 
 /**
- * __rtnl_register - Register a rtnetlink message type
+ * rtnl_register - Register a rtnetlink message type
  * @protocol: Protocol family or PF_UNSPEC
  * @msgtype: rtnetlink message type
  * @doit: Function pointer called for each request message
@@ -252,35 +252,18 @@ EXPORT_SYMBOL_GPL(rtnl_register_module);
  * The special protocol family PF_UNSPEC may be used to define fallback
  * function pointers for the case when no entry for the specific protocol
  * family exists.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
-		    unsigned int flags)
-{
-	return rtnl_register_internal(NULL, protocol, msgtype,
-				      doit, dumpit, flags);
-}
-EXPORT_SYMBOL_GPL(__rtnl_register);
-
-/**
- * rtnl_register - Register a rtnetlink message type
- *
- * Identical to __rtnl_register() but panics on failure. This is useful
- * as failure of this function is very unlikely, it can only happen due
- * to lack of memory when allocating the chain to store all message
- * handlers for a protocol. Meant for use in init functions where lack
- * of memory implies no sense in continuing.
  */
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 		   unsigned int flags)
 {
-	if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0)
-		panic("Unable to register rtnetlink message handler, "
-		      "protocol = %d, message type = %d\n",
-		      protocol, msgtype);
+	int err;
+
+	err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
+				     flags);
+	if (err)
+		pr_err("Unable to register rtnetlink message handler, "
+		       "protocol = %d, message type = %d\n", protocol, msgtype);
 }
 EXPORT_SYMBOL_GPL(rtnl_register);
 
-- 
cgit 


From a3fde2addd5f0218b64102005a237ef727b0dc30 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 4 Dec 2017 19:19:18 +0100
Subject: rtnetlink: ipv6: convert remaining users to rtnl_register_module

convert remaining users of rtnl_register to rtnl_register_module
and un-export rtnl_register.

Requested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a55d0c236b40..642b3afb12b9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -265,7 +265,6 @@ void rtnl_register(int protocol, int msgtype,
 		pr_err("Unable to register rtnetlink message handler, "
 		       "protocol = %d, message type = %d\n", protocol, msgtype);
 }
-EXPORT_SYMBOL_GPL(rtnl_register);
 
 /**
  * rtnl_unregister - Unregister a rtnetlink message type
-- 
cgit 


From 792f3dd6f02b25afc0b42adce2efd523e81c3fb1 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Mon, 4 Dec 2017 14:18:29 -0800
Subject: bpf: move bpf csum flag check

trivial move the BPF_F_ZERO_CSUM_TX check right below the
'flags & BPF_F_DONT_FRAGMENT', so common tun_flags handling
is logically together.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 8ec5a504eb28..4d644ad17457 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3013,6 +3013,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
 	if (flags & BPF_F_DONT_FRAGMENT)
 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+	if (flags & BPF_F_ZERO_CSUM_TX)
+		info->key.tun_flags &= ~TUNNEL_CSUM;
 
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
@@ -3028,9 +3030,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 	}
 
-	if (flags & BPF_F_ZERO_CSUM_TX)
-		info->key.tun_flags &= ~TUNNEL_CSUM;
-
 	return 0;
 }
 
-- 
cgit 


From f19397a5c65665d66e3866b42056f1f58b7a366b Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 1 Dec 2017 10:15:04 -0800
Subject: bpf: Add access to snd_cwnd and others in sock_ops

Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these
fields are only valid if the socket associated with the sock_ops program
call is a full socket, the field is_fullsock is also added to the
bpf_sock_ops struct. If the socket is not a full socket, reading these
fields returns 0.

Note that in most cases it will not be necessary to check is_fullsock to
know if there is a full socket. The context of the call, as specified by
the 'op' field, can sometimes determine whether there is a full socket.

The struct bpf_sock_ops has the following fields added:

  __u32 is_fullsock;      /* Some TCP fields are only valid if
                           * there is a full socket. If not, the
                           * fields read as zero.
			   */
  __u32 snd_cwnd;
  __u32 srtt_us;          /* Averaged RTT << 3 in usecs */

There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add
read access to more 32 bit tcp_sock fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 4d644ad17457..754abe1041b7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct bpf_sock_ops, is_fullsock):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		break;
+
+/* Helper macro for adding read access to tcp_sock fields. */
+#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+	do {								      \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+				      offsetof(struct tcp_sock, FIELD_NAME)); \
+	} while (0)
+
+	case offsetof(struct bpf_sock_ops, snd_cwnd):
+		SOCK_OPS_GET_TCP32(snd_cwnd);
+		break;
+
+	case offsetof(struct bpf_sock_ops, srtt_us):
+		SOCK_OPS_GET_TCP32(srtt_us);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit 


From b0e9fe1ba7f8305dc1c640fbeb1b8c5c609e604c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 4 Dec 2017 22:42:30 +0100
Subject: rtnetlink: fix rtnl_link msghandler rcu annotations

Incorrect/missing annotations caused a few sparse warnings:

rtnetlink.c:155:15: incompatible types .. (different address spaces)
rtnetlink.c:157:23: incompatible types .. (different address spaces)
rtnetlink.c:185:15: incompatible types .. (different address spaces)
rtnetlink.c:285:15: incompatible types .. (different address spaces)
rtnetlink.c:317:9: incompatible types .. (different address spaces)
rtnetlink.c:3054:23: incompatible types .. (different address spaces)

no change in generated code.

Fixes: addf9b90de22f7 ("net: rtnetlink: use rcu to free rtnl message handlers")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 642b3afb12b9..a4faefd65006 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -129,7 +129,7 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
-static struct rtnl_link __rcu **rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
 {
@@ -164,7 +164,8 @@ static int rtnl_register_internal(struct module *owner,
 				  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 				  unsigned int flags)
 {
-	struct rtnl_link **tab, *link, *old;
+	struct rtnl_link *link, *old;
+	struct rtnl_link __rcu **tab;
 	int msgindex;
 	int ret = -ENOBUFS;
 
-- 
cgit 


From 62b32379fd124fea521484ba7e220d8a449f0b59 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Mon, 4 Dec 2017 11:31:48 +0100
Subject: flow_dissector: dissect tunnel info outside __skb_flow_dissect()

Move dissection of tunnel info to outside of the main flow dissection
function, __skb_flow_dissect(). The sole user of this feature, the flower
classifier, is updated to call tunnel info dissection directly, using
skb_flow_dissect_tunnel_info().

This results in a slightly less complex implementation of
__skb_flow_dissect(), in particular removing logic from that call path
which is not used by the majority of users. The expense of this is borne by
the flower classifier which now has to make an extra call for tunnel info
dissection.

This patch should not result in any behavioural change.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 15ce30063765..cc75488d3653 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -133,10 +133,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 	ctrl->addr_type = type;
 }
 
-static void
-__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
-			       struct flow_dissector *flow_dissector,
-			       void *target_container)
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+			     struct flow_dissector *flow_dissector,
+			     void *target_container)
 {
 	struct ip_tunnel_info *info;
 	struct ip_tunnel_key *key;
@@ -212,6 +212,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 		tp->dst = key->tp_dst;
 	}
 }
+EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
 
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
@@ -576,9 +577,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
-	__skb_flow_dissect_tunnel_info(skb, flow_dissector,
-				       target_container);
-
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
-- 
cgit 


From 6c148184b5c868ad2c8a5a4a777cd8097622368a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:54:06 -0800
Subject: net: sched: cleanup qdisc_run and __qdisc_run semantics

Currently __qdisc_run calls qdisc_run_end() but does not call
qdisc_run_begin(). This makes it hard to track pairs of
qdisc_run_{begin,end} across function calls.

To simplify reading these code paths this patch moves begin/end calls
into qdisc_run().

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6bea8931bb62..44c7de365f55 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3192,9 +3192,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
-		} else
-			qdisc_run_end(q);
+		}
 
+		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
@@ -3204,6 +3204,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
+			qdisc_run_end(q);
 		}
 	}
 	spin_unlock(root_lock);
-- 
cgit 


From 6b3ba9146fe64b9bebb6346c9dcfe3b4851de2d7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:54:25 -0800
Subject: net: sched: allow qdiscs to handle locking

This patch adds a flag for queueing disciplines to indicate the stack
does not need to use the qdisc lock to protect operations. This can
be used to build lockless scheduling algorithms and improving
performance.

The flag is checked in the tx path and the qdisc lock is only taken
if it is not set. For now use a conditional if statement. Later we
could be more aggressive if it proves worthwhile and use a static key
or wrap this in a likely().

Also the lockless case drops the TCQ_F_CAN_BYPASS logic. The reason
for this is synchronizing a qlen counter across threads proves to
cost more than doing the enqueue/dequeue operations when tested with
pktgen.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 44c7de365f55..e32cf5c7f200 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+			__qdisc_drop(skb, &to_free);
+			rc = NET_XMIT_DROP;
+		} else {
+			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+			__qdisc_run(q);
+		}
+
+		if (unlikely(to_free))
+			kfree_skb_list(to_free);
+		return rc;
+	}
+
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -4144,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 
 		while (head) {
 			struct Qdisc *q = head;
-			spinlock_t *root_lock;
+			spinlock_t *root_lock = NULL;
 
 			head = head->next_sched;
 
-			root_lock = qdisc_lock(q);
-			spin_lock(root_lock);
+			if (!(q->flags & TCQ_F_NOLOCK)) {
+				root_lock = qdisc_lock(q);
+				spin_lock(root_lock);
+			}
 			/* We need to make sure head->next_sched is read
 			 * before clearing __QDISC_STATE_SCHED
 			 */
 			smp_mb__before_atomic();
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
 			qdisc_run(q);
-			spin_unlock(root_lock);
+			if (root_lock)
+				spin_unlock(root_lock);
 		}
 	}
 }
-- 
cgit 


From b01ac095c740fc21f4bb21abe900b0f5b3042cf9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 7 Dec 2017 09:57:20 -0800
Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

The sch_mq qdisc creates a sub-qdisc per tx queue which are then
called independently for enqueue and dequeue operations. However
statistics are aggregated and pushed up to the "master" qdisc.

This patch adds support for any of the sub-qdiscs to be per cpu
statistic qdiscs. To handle this case add a check when calculating
stats and aggregate the per cpu stats if needed.

Also exports __gnet_stats_copy_queue() to use as a helper function.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_stats.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 87f28557b329..b2b2323bdc84 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
 	}
 }
 
-static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
-				    const struct gnet_stats_queue __percpu *cpu,
-				    const struct gnet_stats_queue *q,
-				    __u32 qlen)
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+			     const struct gnet_stats_queue __percpu *cpu,
+			     const struct gnet_stats_queue *q,
+			     __u32 qlen)
 {
 	if (cpu) {
 		__gnet_stats_copy_queue_cpu(qstats, cpu);
@@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
 
 	qstats->qlen = qlen;
 }
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
 
 /**
  * gnet_stats_copy_queue - copy queue statistics into statistics TLV
-- 
cgit 


From 46e6b992c2502b094e61da6994f1363f3b7c1413 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 7 Dec 2017 15:40:19 -0800
Subject: rtnetlink: allow GSO maximums to be set on device creation

Netlink device already allows changing GSO sizes with
ip set command. The part that is missing is allowing overriding
GSO settings on device creation.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a4faefd65006..412ebf0b09c6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1637,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
 	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },
 	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },
+	[IFLA_GSO_MAX_SEGS]	= { .type = NLA_U32 },
+	[IFLA_GSO_MAX_SIZE]	= { .type = NLA_U32 },
 	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -2287,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb,
 		}
 	}
 
+	if (tb[IFLA_GSO_MAX_SIZE]) {
+		u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+		if (max_size > GSO_MAX_SIZE) {
+			err = -EINVAL;
+			goto errout;
+		}
+
+		if (dev->gso_max_size ^ max_size) {
+			netif_set_gso_max_size(dev, max_size);
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
+	if (tb[IFLA_GSO_MAX_SEGS]) {
+		u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+		if (max_segs > GSO_MAX_SEGS) {
+			err = -EINVAL;
+			goto errout;
+		}
+
+		if (dev->gso_max_segs ^ max_segs) {
+			dev->gso_max_segs = max_segs;
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
 	if (tb[IFLA_OPERSTATE])
 		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
 
@@ -2651,6 +2681,10 @@ struct net_device *rtnl_create_link(struct net *net,
 		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
 	if (tb[IFLA_GROUP])
 		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+	if (tb[IFLA_GSO_MAX_SIZE])
+		netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
+	if (tb[IFLA_GSO_MAX_SEGS])
+		dev->gso_max_size = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
 
 	return dev;
 }
-- 
cgit 


From a0b586fa75a69578ecf10b40582eed9b35de2432 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 8 Dec 2017 15:34:13 -0800
Subject: rtnetlink: fix typo in GSO max segments

Fixes: 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 412ebf0b09c6..c688dc564b11 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2684,7 +2684,7 @@ struct net_device *rtnl_create_link(struct net *net,
 	if (tb[IFLA_GSO_MAX_SIZE])
 		netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
 	if (tb[IFLA_GSO_MAX_SEGS])
-		dev->gso_max_size = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+		dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
 
 	return dev;
 }
-- 
cgit 


From 8d74e9f88d65af8bb2e095aff506aa6eac755ada Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 12 Dec 2017 11:39:04 -0500
Subject: net: avoid skb_warn_bad_offload on IS_ERR

skb_warn_bad_offload warns when packets enter the GSO stack that
require skb_checksum_help or vice versa. Do not warn on arbitrary
bad packets. Packet sockets can craft many. Syzkaller was able to
demonstrate another one with eth_type games.

In particular, suppress the warning when segmentation returns an
error, which is for reasons other than checksum offload.

See also commit 36c92474498a ("net: WARN if skb_checksum_help() is
called on skb requiring segmentation") for context on this warning.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8aa2f70995e8..b0eee49a2489 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2803,7 +2803,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 
 	segs = skb_mac_gso_segment(skb, features);
 
-	if (unlikely(skb_needs_check(skb, tx_path)))
+	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 		skb_warn_bad_offload(skb);
 
 	return segs;
-- 
cgit 


From c060bc6115b6a204cb60e6b03fe64135731bc6c8 Mon Sep 17 00:00:00 2001
From: Xiongwei Song <sxwjean@gmail.com>
Date: Tue, 19 Dec 2017 07:17:15 +0800
Subject: bpf: make function xdp_do_generic_redirect_map() static

The function xdp_do_generic_redirect_map() is only used in this file, so
make it static.

Clean up sparse warning:
net/core/filter.c:2687:5: warning: no previous prototype
for 'xdp_do_generic_redirect_map' [-Wmissing-prototypes]

Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 754abe1041b7..130b842c3a15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2684,8 +2684,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
 	return 0;
 }
 
-int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
-				struct bpf_prog *xdp_prog)
+static int xdp_do_generic_redirect_map(struct net_device *dev,
+				       struct sk_buff *skb,
+				       struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
-- 
cgit 


From 08fc7f8140730d2f8499c91b5abad44581b74635 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Thu, 14 Dec 2017 05:51:57 -0800
Subject: sock: Change the netns_core member name.

Change the member name will make the code more readable.
This patch will be used in next patch.

Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index c0b5b2f17412..c2dd2d339db7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3045,7 +3045,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3055,7 +3055,7 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 	int res = 0;
 
 	for_each_possible_cpu(cpu)
-		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
 	return res >= 0 ? res : 0;
 }
@@ -3063,13 +3063,13 @@ EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-	net->core.inuse = alloc_percpu(struct prot_inuse);
-	return net->core.inuse ? 0 : -ENOMEM;
+	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+	return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-	free_percpu(net->core.inuse);
+	free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
cgit 


From 648845ab7e200993dccd3948c719c858368c91e7 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Thu, 14 Dec 2017 05:51:58 -0800
Subject: sock: Move the socket inuse to namespace.

In some case, we want to know how many sockets are in use in
different _net_ namespaces. It's a key resource metric.

This patch add a member in struct netns_core. This is a counter
for socket-inuse in the _net_ namespace. The patch will add/sub
counter in the sk_alloc, sk_clone_lock and __sk_free.

This patch will not counter the socket created in kernel.
It's not very useful for userspace to know how many kernel
sockets we created.

The main reasons for doing this are that:

1. When linux calls the 'do_exit' for process to exit, the functions
'exit_task_namespaces' and 'exit_task_work' will be called sequentially.
'exit_task_namespaces' may have destroyed the _net_ namespace, but
'sock_release' called in 'exit_task_work' may use the _net_ namespace
if we counter the socket-inuse in sock_release.

2. socket and sock are in pair. More important, sock holds the _net_
namespace. We counter the socket-inuse in sock, for avoiding holding
_net_ namespace again in socket. It's a easy way to maintain the code.

Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index c2dd2d339db7..72d14b221784 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,8 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+static void sock_inuse_add(struct net *net, int val);
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sk->sk_kern_sock = kern;
 		sock_lock_init(sk);
 		sk->sk_net_refcnt = kern ? 0 : 1;
-		if (likely(sk->sk_net_refcnt))
+		if (likely(sk->sk_net_refcnt)) {
 			get_net(net);
+			sock_inuse_add(net, 1);
+		}
+
 		sock_net_set(sk, net);
 		refcount_set(&sk->sk_wmem_alloc, 1);
 
@@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+	if (likely(sk->sk_net_refcnt))
+		sock_inuse_add(sock_net(sk), -1);
+
 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
 		sock_diag_broadcast_destroy(sk);
 	else
@@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
+		if (likely(newsk->sk_net_refcnt))
+			sock_inuse_add(sock_net(newsk), 1);
 
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
@@ -3061,15 +3071,44 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+	this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+	int cpu, res = 0;
+
+	for_each_possible_cpu(cpu)
+		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+	return res;
+}
+
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
 static int __net_init sock_inuse_init_net(struct net *net)
 {
 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-	return net->core.prot_inuse ? 0 : -ENOMEM;
+	if (net->core.prot_inuse == NULL)
+		return -ENOMEM;
+
+	net->core.sock_inuse = alloc_percpu(int);
+	if (net->core.sock_inuse == NULL)
+		goto out;
+
+	return 0;
+
+out:
+	free_percpu(net->core.prot_inuse);
+	return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
 	free_percpu(net->core.prot_inuse);
+	free_percpu(net->core.sock_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
@@ -3112,6 +3151,10 @@ static inline void assign_proto_idx(struct proto *prot)
 static inline void release_proto_idx(struct proto *prot)
 {
 }
+
+static void sock_inuse_add(struct net *net, int val)
+{
+}
 #endif
 
 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
-- 
cgit 


From fb1f5f79ae96331a0201b4080d34f3bc3b5c0b1d Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Sat, 16 Dec 2017 03:09:40 -0500
Subject: net: Introduce NETIF_F_GRO_HW.

Introduce NETIF_F_GRO_HW feature flag for NICs that support hardware
GRO.  With this flag, we can now independently turn on or off hardware
GRO when GRO is on.  Previously, drivers were using NETIF_F_GRO to
control hardware GRO and so it cannot be independently turned on or
off without affecting GRO.

Hardware GRO (just like GRO) guarantees that packets can be re-segmented
by TSO/GSO to reconstruct the original packet stream.  Logically,
GRO_HW should depend on GRO since it a subset, but we will let
individual drivers enforce this dependency as they see fit.

Since NETIF_F_GRO is not propagated between upper and lower devices,
NETIF_F_GRO_HW should follow suit since it is a subset of GRO.  In other
words, a lower device can independent have GRO/GRO_HW enabled or disabled
and no feature propagation is required.  This will preserve the current
GRO behavior.  This can be changed later if we decide to propagate GRO/
GRO_HW/RXCSUM from upper to lower devices.

Cc: Ariel Elior <Ariel.Elior@cavium.com>
Cc: everest-linux-l2@cavium.com
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c     | 12 ++++++++++++
 net/core/ethtool.c |  1 +
 2 files changed, 13 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index b0eee49a2489..4b43f5dcabcd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7424,6 +7424,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		features &= ~dev->gso_partial_features;
 	}
 
+	if (!(features & NETIF_F_RXCSUM)) {
+		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
+		 * successfully merged by hardware must also have the
+		 * checksum verified by hardware.  If the user does not
+		 * want to enable RXCSUM, logically, we should disable GRO_HW.
+		 */
+		if (features & NETIF_F_GRO_HW) {
+			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
+			features &= ~NETIF_F_GRO_HW;
+		}
+	}
+
 	return features;
 }
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f8fcf450a36e..50a79203043b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_LLTX_BIT] =             "tx-lockless",
 	[NETIF_F_NETNS_LOCAL_BIT] =      "netns-local",
 	[NETIF_F_GRO_BIT] =              "rx-gro",
+	[NETIF_F_GRO_HW_BIT] =           "rx-gro-hw",
 	[NETIF_F_LRO_BIT] =              "rx-lro",
 
 	[NETIF_F_TSO_BIT] =              "tx-tcp-segmentation",
-- 
cgit 


From 56f5aa77cdad1076bea0ae8ddeb74ba68ddc9502 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Sat, 16 Dec 2017 03:09:41 -0500
Subject: net: Disable GRO_HW when generic XDP is installed on a device.

Hardware should not aggregate any packets when generic XDP is installed.

Cc: Ariel Elior <Ariel.Elior@cavium.com>
Cc: everest-linux-l2@cavium.com
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4b43f5dcabcd..c7db39926769 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1542,6 +1542,23 @@ void dev_disable_lro(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_disable_lro);
 
+/**
+ *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
+ *	@dev: device
+ *
+ *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
+ *	called under RTNL.  This is needed if Generic XDP is installed on
+ *	the device.
+ */
+static void dev_disable_gro_hw(struct net_device *dev)
+{
+	dev->wanted_features &= ~NETIF_F_GRO_HW;
+	netdev_update_features(dev);
+
+	if (unlikely(dev->features & NETIF_F_GRO_HW))
+		netdev_WARN(dev, "failed to disable GRO_HW!\n");
+}
+
 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 				   struct net_device *dev)
 {
@@ -4564,6 +4581,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 		} else if (new && !old) {
 			static_key_slow_inc(&generic_xdp_needed);
 			dev_disable_lro(dev);
+			dev_disable_gro_hw(dev);
 		}
 		break;
 
-- 
cgit 


From 3dca3f38cfb8efb8571040568cac7d0025fa5bb1 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 20 Dec 2017 10:41:31 +0100
Subject: xfrm: Separate ESP handling from segmentation for GRO packets.

We change the ESP GSO handlers to only segment the packets.
The ESP handling and encryption is defered to validate_xmit_xfrm()
where this is done for non GRO packets too. This makes the code
more robust and prepares for asynchronous crypto handling.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/core/dev.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c7db39926769..fb7a24a373d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3083,9 +3083,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		    __skb_linearize(skb))
 			goto out_kfree_skb;
 
-		if (validate_xmit_xfrm(skb, features))
-			goto out_kfree_skb;
-
 		/* If packet is not checksummed and device does not
 		 * support checksumming for this protocol, complete
 		 * checksumming here.
@@ -3102,6 +3099,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		}
 	}
 
+	skb = validate_xmit_xfrm(skb, features);
+
 	return skb;
 
 out_kfree_skb:
-- 
cgit 


From f53c723902d1ac5f0b0a11d7c9dcbff748dde74e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 20 Dec 2017 10:41:36 +0100
Subject: net: Add asynchronous callbacks for xfrm on layer 2.

This patch implements asynchronous crypto callbacks
and a backlog handler that can be used when IPsec
is done at layer 2 in the TX path. It also extends
the skb validate functions so that we can update
the driver transmit return codes based on async
crypto operation or to indicate that we queued the
packet in a backlog queue.

Joint work with: Aviv Heller <avivh@mellanox.com>

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/core/dev.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index fb7a24a373d1..821dd8cb7169 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3059,7 +3059,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_csum_hwoffload_help);
 
-static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 {
 	netdev_features_t features;
 
@@ -3099,7 +3099,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		}
 	}
 
-	skb = validate_xmit_xfrm(skb, features);
+	skb = validate_xmit_xfrm(skb, features, again);
 
 	return skb;
 
@@ -3110,7 +3110,7 @@ out_null:
 	return NULL;
 }
 
-struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 {
 	struct sk_buff *next, *head = NULL, *tail;
 
@@ -3121,7 +3121,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
 		/* in case skb wont be segmented, point to itself */
 		skb->prev = skb;
 
-		skb = validate_xmit_skb(skb, dev);
+		skb = validate_xmit_skb(skb, dev, again);
 		if (!skb)
 			continue;
 
@@ -3448,6 +3448,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	struct netdev_queue *txq;
 	struct Qdisc *q;
 	int rc = -ENOMEM;
+	bool again = false;
 
 	skb_reset_mac_header(skb);
 
@@ -3509,7 +3510,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 				     XMIT_RECURSION_LIMIT))
 				goto recursion_alert;
 
-			skb = validate_xmit_skb(skb, dev);
+			skb = validate_xmit_skb(skb, dev, &again);
 			if (!skb)
 				goto out;
 
@@ -4193,6 +4194,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 				spin_unlock(root_lock);
 		}
 	}
+
+	xfrm_dev_backlog(sd);
 }
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
@@ -8874,6 +8877,9 @@ static int __init net_dev_init(void)
 
 		skb_queue_head_init(&sd->input_pkt_queue);
 		skb_queue_head_init(&sd->process_queue);
+#ifdef CONFIG_XFRM_OFFLOAD
+		skb_queue_head_init(&sd->xfrm_backlog);
+#endif
 		INIT_LIST_HEAD(&sd->poll_list);
 		sd->output_queue_tailp = &sd->output_queue;
 #ifdef CONFIG_RPS
-- 
cgit 


From 5b0890a97204627d75a333fc30f29f737e2bfad6 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:42 +0100
Subject: flow_dissector: Parse batman-adv unicast headers

The batman-adv unicast packets contain a full layer 2 frame in encapsulated
form. The flow dissector must therefore be able to parse the batman-adv
unicast header to reach the layer 2+3 information.

  +--------------------+
  | ip(v6)hdr          |
  +--------------------+
  | inner ethhdr       |
  +--------------------+
  | batadv unicast hdr |
  +--------------------+
  | outer ethhdr       |
  +--------------------+

The obtained information from the upper layer can then be used by RPS to
schedule the processing on separate cores. This allows better distribution
of multiple flows from the same neighbor to different cores.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'net/core')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index cc75488d3653..02db7b122a73 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -24,6 +24,7 @@
 #include <linux/tcp.h>
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
+#include <uapi/linux/batadv_packet.h>
 
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
@@ -437,6 +438,57 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	return FLOW_DISSECT_RET_PROTO_AGAIN;
 }
 
+/**
+ * __skb_flow_dissect_batadv() - dissect batman-adv header
+ * @skb: sk_buff to with the batman-adv header
+ * @key_control: flow dissectors control key
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @p_proto: pointer used to update the protocol to process next
+ * @p_nhoff: pointer used to update inner network header offset
+ * @hlen: packet header length
+ * @flags: any combination of FLOW_DISSECTOR_F_*
+ *
+ * ETH_P_BATMAN packets are tried to be dissected. Only
+ * &struct batadv_unicast packets are actually processed because they contain an
+ * inner ethernet header and are usually followed by actual network header. This
+ * allows the flow dissector to continue processing the packet.
+ *
+ * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found,
+ *  FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation,
+ *  otherwise FLOW_DISSECT_RET_OUT_BAD
+ */
+static enum flow_dissect_ret
+__skb_flow_dissect_batadv(const struct sk_buff *skb,
+			  struct flow_dissector_key_control *key_control,
+			  void *data, __be16 *p_proto, int *p_nhoff, int hlen,
+			  unsigned int flags)
+{
+	struct {
+		struct batadv_unicast_packet batadv_unicast;
+		struct ethhdr eth;
+	} *hdr, _hdr;
+
+	hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen,
+				   &_hdr);
+	if (!hdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	if (hdr->batadv_unicast.packet_type != BATADV_UNICAST)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	*p_proto = hdr->eth.h_proto;
+	*p_nhoff += sizeof(*hdr);
+
+	key_control->flags |= FLOW_DIS_ENCAPSULATION;
+	if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+		return FLOW_DISSECT_RET_OUT_GOOD;
+
+	return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
 static void
 __skb_flow_dissect_tcp(const struct sk_buff *skb,
 		       struct flow_dissector *flow_dissector,
@@ -815,6 +867,11 @@ proto_again:
 					       nhoff, hlen);
 		break;
 
+	case htons(ETH_P_BATMAN):
+		fdret = __skb_flow_dissect_batadv(skb, key_control, data,
+						  &proto, &nhoff, hlen, flags);
+		break;
+
 	default:
 		fdret = FLOW_DISSECT_RET_OUT_BAD;
 		break;
-- 
cgit 


From bf5c25d608613eaf4dcdba5a9cac5b2afe67d635 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Dec 2017 19:00:17 -0500
Subject: skbuff: in skb_segment, call zerocopy functions once per nskb

This is a net-next follow-up to commit 268b79067942 ("skbuff: orphan
frags before zerocopy clone"), which fixed a bug in net, but added a
call to skb_zerocopy_clone at each frag to do so.

When segmenting skbs with user frags, either the user frags must be
replaced with private copies and uarg released, or the uarg must have
its refcount increased for each new skb.

skb_orphan_frags does the first, except for cases that can handle
reference counting. skb_zerocopy_clone then does the second.

Call these once per nskb, instead of once per frag.

That is, in the common case. With a frag list, also refresh when the
origin skb (frag_skb) changes.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a3cb0be4c6f3..00b0757830e2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3656,6 +3656,10 @@ normal:
 		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
 					      SKBTX_SHARED_FRAG;
 
+		if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
+		    skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+			goto err;
+
 		while (pos < offset + len) {
 			if (i >= nfrags) {
 				BUG_ON(skb_headlen(list_skb));
@@ -3667,6 +3671,11 @@ normal:
 
 				BUG_ON(!nfrags);
 
+				if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
+				    skb_zerocopy_clone(nskb, frag_skb,
+						       GFP_ATOMIC))
+					goto err;
+
 				list_skb = list_skb->next;
 			}
 
@@ -3678,11 +3687,6 @@ normal:
 				goto err;
 			}
 
-			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
-				goto err;
-			if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
-				goto err;
-
 			*nskb_frag = *frag;
 			__skb_frag_ref(nskb_frag);
 			size = skb_frag_size(nskb_frag);
-- 
cgit 


From aecd67b60722dd24353b0bc50e78a55b30707dcd Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:13 +0100
Subject: xdp: base API for new XDP rx-queue info concept

This patch only introduce the core data structures and API functions.
All XDP enabled drivers must use the API before this info can used.

There is a need for XDP to know more about the RX-queue a given XDP
frames have arrived on.  For both the XDP bpf-prog and kernel side.

Instead of extending xdp_buff each time new info is needed, the patch
creates a separate read-mostly struct xdp_rxq_info, that contains this
info.  We stress this data/cache-line is for read-only info.  This is
NOT for dynamic per packet info, use the data_meta for such use-cases.

The performance advantage is this info can be setup at RX-ring init
time, instead of updating N-members in xdp_buff.  A possible (driver
level) micro optimization is that xdp_buff->rxq assignment could be
done once per XDP/NAPI loop.  The extra pointer deref only happens for
program needing access to this info (thus, no slowdown to existing
use-cases).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/Makefile |  2 +-
 net/core/xdp.c    | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 net/core/xdp.c

(limited to 'net/core')

diff --git a/net/core/Makefile b/net/core/Makefile
index 1fd0a9c88b1b..6dbbba8c57ae 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
 			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
-			fib_notifier.o
+			fib_notifier.o xdp.o
 
 obj-y += net-sysfs.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
diff --git a/net/core/xdp.c b/net/core/xdp.c
new file mode 100644
index 000000000000..229bc5a0ee04
--- /dev/null
+++ b/net/core/xdp.c
@@ -0,0 +1,67 @@
+/* net/core/xdp.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include <net/xdp.h>
+
+#define REG_STATE_NEW		0x0
+#define REG_STATE_REGISTERED	0x1
+#define REG_STATE_UNREGISTERED	0x2
+#define REG_STATE_UNUSED	0x3
+
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
+{
+	/* Simplify driver cleanup code paths, allow unreg "unused" */
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED)
+		return;
+
+	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+
+	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
+	xdp_rxq->dev = NULL;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
+
+static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
+{
+	memset(xdp_rxq, 0, sizeof(*xdp_rxq));
+}
+
+/* Returns 0 on success, negative on failure */
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index)
+{
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
+		WARN(1, "Driver promised not to register this");
+		return -EINVAL;
+	}
+
+	if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
+		WARN(1, "Missing unregister, handled but fix driver");
+		xdp_rxq_info_unreg(xdp_rxq);
+	}
+
+	if (!dev) {
+		WARN(1, "Missing net_device from driver");
+		return -ENODEV;
+	}
+
+	/* State either UNREGISTERED or NEW */
+	xdp_rxq_info_init(xdp_rxq);
+	xdp_rxq->dev = dev;
+	xdp_rxq->queue_index = queue_index;
+
+	xdp_rxq->reg_state = REG_STATE_REGISTERED;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
+{
+	xdp_rxq->reg_state = REG_STATE_UNUSED;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
-- 
cgit 


From c0124f327e5cabd844a10d7e1fc5aa2a81e796a9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:34 +0100
Subject: xdp/qede: setup xdp_rxq_info and intro xdp_rxq_info_is_reg

The driver code qede_free_fp_array() depend on kfree() can be called
with a NULL pointer. This stems from the qede_alloc_fp_array()
function which either (kz)alloc memory for fp->txq or fp->rxq.
This also simplifies error handling code in case of memory allocation
failures, but xdp_rxq_info_unreg need to know the difference.

Introduce xdp_rxq_info_is_reg() to handle if a memory allocation fails
and detect this is the failure path by seeing that xdp_rxq_info was
not registred yet, which first happens after successful alloaction in
qede_init_fp().

Driver hook points for xdp_rxq_info:
 * reg  : qede_init_fp
 * unreg: qede_free_fp_array

Tested on actual hardware with samples/bpf program.

V2: Driver have no proper error path for failed XDP RX-queue info reg, as
qede_init_fp() is a void function.

Cc: everest-linux-l2@cavium.com
Cc: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/xdp.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core')

diff --git a/net/core/xdp.c b/net/core/xdp.c
index 229bc5a0ee04..097a0f74e004 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -65,3 +65,9 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
 	xdp_rxq->reg_state = REG_STATE_UNUSED;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
+
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
+{
+	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
-- 
cgit 


From e817f85652c14d78f170b18797e4c477c78949e0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:09 +0100
Subject: xdp: generic XDP handling of xdp_rxq_info

Hook points for xdp_rxq_info:
 * reg  : netif_alloc_rx_queues
 * unreg: netif_free_rx_queues

The net_device have some members (num_rx_queues + real_num_rx_queues)
and data-area (dev->_rx with struct netdev_rx_queue's) that were
primarily used for exporting information about RPS (CONFIG_RPS) queues
to sysfs (CONFIG_SYSFS).

For generic XDP extend struct netdev_rx_queue with the xdp_rxq_info,
and remove some of the CONFIG_SYSFS ifdefs.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/dev.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2eb66c0d9cdb..d7925ef8743d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3906,9 +3906,33 @@ drop:
 	return NET_RX_DROP;
 }
 
+static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct netdev_rx_queue *rxqueue;
+
+	rxqueue = dev->_rx;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+
+		if (unlikely(index >= dev->real_num_rx_queues)) {
+			WARN_ONCE(dev->real_num_rx_queues > 1,
+				  "%s received packet on queue %u, but number "
+				  "of RX queues is %u\n",
+				  dev->name, index, dev->real_num_rx_queues);
+
+			return rxqueue; /* Return first rxqueue */
+		}
+		rxqueue += index;
+	}
+	return rxqueue;
+}
+
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	struct netdev_rx_queue *rxqueue;
 	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
 	void *orig_data;
@@ -3952,6 +3976,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
 
+	rxqueue = netif_get_rxqueue(skb);
+	xdp.rxq = &rxqueue->xdp_rxq;
+
 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
 	off = xdp.data - orig_data;
@@ -7589,12 +7616,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
 	struct netdev_rx_queue *rx;
 	size_t sz = count * sizeof(*rx);
+	int err = 0;
 
 	BUG_ON(count < 1);
 
@@ -7604,11 +7631,39 @@ static int netif_alloc_rx_queues(struct net_device *dev)
 
 	dev->_rx = rx;
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		rx[i].dev = dev;
+
+		/* XDP RX-queue setup */
+		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+		if (err < 0)
+			goto err_rxq_info;
+	}
 	return 0;
+
+err_rxq_info:
+	/* Rollback successful reg's and free other resources */
+	while (i--)
+		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+	kfree(dev->_rx);
+	dev->_rx = NULL;
+	return err;
+}
+
+static void netif_free_rx_queues(struct net_device *dev)
+{
+	unsigned int i, count = dev->num_rx_queues;
+	struct netdev_rx_queue *rx;
+
+	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
+	if (!dev->_rx)
+		return;
+
+	rx = dev->_rx;
+
+	for (i = 0; i < count; i++)
+		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 }
-#endif
 
 static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue, void *_unused)
@@ -8169,12 +8224,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-#ifdef CONFIG_SYSFS
 	if (rxqs < 1) {
 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 		return NULL;
 	}
-#endif
 
 	alloc_size = sizeof(struct net_device);
 	if (sizeof_priv) {
@@ -8231,12 +8284,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (netif_alloc_netdev_queues(dev))
 		goto free_all;
 
-#ifdef CONFIG_SYSFS
 	dev->num_rx_queues = rxqs;
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
 		goto free_all;
-#endif
 
 	strcpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
@@ -8275,9 +8326,7 @@ void free_netdev(struct net_device *dev)
 
 	might_sleep();
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_SYSFS
-	kvfree(dev->_rx);
-#endif
+	netif_free_rx_queues(dev);
 
 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 
-- 
cgit 


From 02dd3291b2f095bbc88e1d2628fd5bf2e92de69b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:14 +0100
Subject: bpf: finally expose xdp_rxq_info to XDP bpf-programs

Now all XDP driver have been updated to setup xdp_rxq_info and assign
this to xdp_buff->rxq.  Thus, it is now safe to enable access to some
of the xdp_rxq_info struct members.

This patch extend xdp_md and expose UAPI to userspace for
ingress_ifindex and rx_queue_index.  Access happens via bpf
instruction rewrite, that load data directly from struct xdp_rxq_info.

* ingress_ifindex map to xdp_rxq_info->dev->ifindex
* rx_queue_index  map to xdp_rxq_info->queue_index

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 130b842c3a15..acdb94c0e97f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
+	case offsetof(struct xdp_md, ingress_ifindex):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+				      si->dst_reg, si->dst_reg,
+				      offsetof(struct xdp_rxq_info, dev));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct net_device,
+						     ifindex, 4, target_size));
+		break;
+	case offsetof(struct xdp_md, rx_queue_index):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct xdp_rxq_info,
+						queue_index, 4, target_size));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit 


From c5a9f6f0ab4054082dd5ce9bbdaa8e8ff05cf365 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Mon, 17 Jul 2017 13:47:07 +0300
Subject: net/core: Add drop counters to VF statistics

Modern hardware can decide to drop packets going to/from a VF.
Add receive and transmit drop counters to be displayed at hypervisor
layer in iproute2 per VF statistics.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 net/core/rtnetlink.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c688dc564b11..5421a3fd3ba1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size_64bit(sizeof(__u64)) +
 			 /* IFLA_VF_STATS_MULTICAST */
 			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_RX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_TX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
 			 nla_total_size(sizeof(struct ifla_vf_trust)));
 		return size;
 	} else
@@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
 			      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
-			      vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+			      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+			      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+			      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
 		nla_nest_cancel(skb, vfstats);
 		goto nla_put_vf_failure;
 	}
-- 
cgit 


From 37e2d99b59c4765112533a1d38174fea58d28a51 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Mon, 8 Jan 2018 16:00:24 +0200
Subject: ethtool: Ensure new ring parameters are within bounds during
 SRINGPARAM

Add a sanity check to ensure that all requested ring parameters
are within bounds, which should reduce errors in driver implementation.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index fff6314f4c5e..107b122c8969 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1693,14 +1693,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
 
 static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
 {
-	struct ethtool_ringparam ringparam;
+	struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
 
-	if (!dev->ethtool_ops->set_ringparam)
+	if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
 		return -EFAULT;
 
+	dev->ethtool_ops->get_ringparam(dev, &max);
+
+	/* ensure new ring parameters are within the maximums */
+	if (ringparam.rx_pending > max.rx_max_pending ||
+	    ringparam.rx_mini_pending > max.rx_mini_max_pending ||
+	    ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending ||
+	    ringparam.tx_pending > max.tx_max_pending)
+		return -EINVAL;
+
 	return dev->ethtool_ops->set_ringparam(dev, &ringparam);
 }
 
-- 
cgit 


From 141b52a98ab45a835ff1ea869414faccdc255a72 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 10 Jan 2018 01:20:01 -0800
Subject: net: use the right variant of kfree

kvzalloc'ed memory should be kvfree'd.

Fixes: e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d7925ef8743d..852a54c769a3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7645,7 +7645,7 @@ err_rxq_info:
 	/* Rollback successful reg's and free other resources */
 	while (i--)
 		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
-	kfree(dev->_rx);
+	kvfree(dev->_rx);
 	dev->_rx = NULL;
 	return err;
 }
-- 
cgit 


From 82aaff2f63443e1d6cc4a186ed9c2a5718123906 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 10 Jan 2018 01:20:02 -0800
Subject: net: free RX queue structures

Looks like commit e817f85652c1 ("xdp: generic XDP handling of
xdp_rxq_info") replaced kvfree(dev->_rx) in free_netdev() with
a call to netif_free_rx_queues() which doesn't actually free
the rings?

While at it remove the unnecessary temporary variable.

Fixes: e817f85652c1 ("xdp: generic XDP handling of xdp_rxq_info")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 852a54c769a3..74e1e5d31337 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7653,16 +7653,15 @@ err_rxq_info:
 static void netif_free_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
-	struct netdev_rx_queue *rx;
 
 	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 	if (!dev->_rx)
 		return;
 
-	rx = dev->_rx;
-
 	for (i = 0; i < count; i++)
-		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
+
+	kvfree(dev->_rx);
 }
 
 static void netdev_init_one_queue(struct net_device *dev,
-- 
cgit 


From d584527c70399cf0d095396d696029f54a10cfd3 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 22 Nov 2017 10:57:41 -0800
Subject: net: Cap number of queues even with accel_priv

With the recent fix to ixgbe we can cap the number of queues always
regardless of if accel_priv is being used or not since the actual number of
queues are being reported via real_num_tx_queues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 net/core/dev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 3d24d9a59086..94435cd09072 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3420,8 +3420,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 		else
 			queue_index = __netdev_pick_tx(dev, skb);
 
-		if (!accel_priv)
-			queue_index = netdev_cap_txqueue(dev, queue_index);
+		queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
 
 	skb_set_queue_mapping(skb, queue_index);
-- 
cgit 


From daaf24c634ab951cad3dcef28492001ef9c931d0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 11 Jan 2018 17:39:09 +0100
Subject: bpf: simplify xdp_convert_ctx_access for xdp_rxq_info

As pointed out by Daniel Borkmann, using bpf_target_off() is not
necessary for xdp_rxq_info when extracting queue_index and
ifindex, as these members are u32 like BPF_W.

Also fix trivial spelling mistake introduced in same commit.

Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index d4b190e63b79..db2ee8c7e1bd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4310,16 +4310,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->dst_reg,
 				      offsetof(struct xdp_rxq_info, dev));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      bpf_target_off(struct net_device,
-						     ifindex, 4, target_size));
+				      offsetof(struct net_device, ifindex));
 		break;
 	case offsetof(struct xdp_md, rx_queue_index):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, rxq));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      bpf_target_off(struct xdp_rxq_info,
-						queue_index, 4, target_size));
+				      offsetof(struct xdp_rxq_info,
+					       queue_index));
 		break;
 	}
 
-- 
cgit 


From 273c28bc57ca9672f7b70bed764ecdfb964930c8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 12 Jan 2018 18:28:31 +0300
Subject: net: Convert atomic_t net::count to refcount_t

Since net could be obtained from RCU lists,
and there is a race with net destruction,
the patch converts net::count to refcount_t.

This provides sanity checks for the cases of
incrementing counter of already dead net,
when maybe_get_net() has to used instead
of get_net().

Drivers: allyesconfig and allmodconfig are OK.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c     | 6 +++---
 net/core/net_namespace.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 799b75268291..7bf8b85ade16 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -961,7 +961,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
 	while (--i >= new_num) {
 		struct kobject *kobj = &dev->_rx[i].kobj;
 
-		if (!atomic_read(&dev_net(dev)->count))
+		if (!refcount_read(&dev_net(dev)->count))
 			kobj->uevent_suppress = 1;
 		if (dev->sysfs_rx_queue_group)
 			sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -1367,7 +1367,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
 	while (--i >= new_num) {
 		struct netdev_queue *queue = dev->_tx + i;
 
-		if (!atomic_read(&dev_net(dev)->count))
+		if (!refcount_read(&dev_net(dev)->count))
 			queue->kobj.uevent_suppress = 1;
 #ifdef CONFIG_BQL
 		sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
 {
 	struct device *dev = &ndev->dev;
 
-	if (!atomic_read(&dev_net(ndev)->count))
+	if (!refcount_read(&dev_net(ndev)->count))
 		dev_set_uevent_suppress(dev, 1);
 
 	kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 60a71be75aea..2213d45fcafd 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -35,7 +35,7 @@ LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
 
 struct net init_net = {
-	.count		= ATOMIC_INIT(1),
+	.count		= REFCOUNT_INIT(1),
 	.dev_base_head	= LIST_HEAD_INIT(init_net.dev_base_head),
 };
 EXPORT_SYMBOL(init_net);
@@ -224,10 +224,10 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 	bool alloc;
 	int id;
 
-	if (atomic_read(&net->count) == 0)
+	if (refcount_read(&net->count) == 0)
 		return NETNSA_NSID_NOT_ASSIGNED;
 	spin_lock_bh(&net->nsid_lock);
-	alloc = atomic_read(&peer->count) == 0 ? false : true;
+	alloc = refcount_read(&peer->count) == 0 ? false : true;
 	id = __peernet2id_alloc(net, peer, &alloc);
 	spin_unlock_bh(&net->nsid_lock);
 	if (alloc && id >= 0)
@@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	int error = 0;
 	LIST_HEAD(net_exit_list);
 
-	atomic_set(&net->count, 1);
+	refcount_set(&net->count, 1);
 	refcount_set(&net->passive, 1);
 	net->dev_base_seq = 1;
 	net->user_ns = user_ns;
-- 
cgit 


From 2406e7e546b223e8cf42c44ac7352d4d1fd1dbcd Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:02 +0100
Subject: devlink: Add per devlink instance lock

This is a preparation before introducing resources and hot reload support.
Currently there are two global lock where one protects all devlink access,
and the second one protects devlink port access. This patch adds per devlink
instance lock which protects the internal members which are the sb/dpipe/
resource/ports. By introducing this lock the global devlink port lock can
be discarded.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 136 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 77 insertions(+), 59 deletions(-)

(limited to 'net/core')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7d430c1d9c3e..2f71734c4ff6 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list);
  */
 static DEFINE_MUTEX(devlink_mutex);
 
-/* devlink_port_mutex
- *
- * Shared lock to guard lists of ports in all devlink devices.
- */
-static DEFINE_MUTEX(devlink_port_mutex);
-
 static struct net *devlink_net(const struct devlink *devlink)
 {
 	return read_pnet(&devlink->_net);
@@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
 #define DEVLINK_NL_FLAG_NEED_DEVLINK	BIT(0)
 #define DEVLINK_NL_FLAG_NEED_PORT	BIT(1)
 #define DEVLINK_NL_FLAG_NEED_SB		BIT(2)
-#define DEVLINK_NL_FLAG_LOCK_PORTS	BIT(3)
-	/* port is not needed but we need to ensure they don't
-	 * change in the middle of command
-	 */
+
+/* The per devlink instance lock is taken by default in the pre-doit
+ * operation, yet several commands do not require this. The global
+ * devlink lock is taken and protects from disruption by user-calls.
+ */
+#define DEVLINK_NL_FLAG_NO_LOCK		BIT(3)
 
 static int devlink_nl_pre_doit(const struct genl_ops *ops,
 			       struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink;
+	int err;
 
 	mutex_lock(&devlink_mutex);
 	devlink = devlink_get_from_info(info);
@@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
 		mutex_unlock(&devlink_mutex);
 		return PTR_ERR(devlink);
 	}
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_lock(&devlink->lock);
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
 		info->user_ptr[0] = devlink;
 	} else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
 		struct devlink_port *devlink_port;
 
-		mutex_lock(&devlink_port_mutex);
 		devlink_port = devlink_port_get_from_info(devlink, info);
 		if (IS_ERR(devlink_port)) {
-			mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_port);
+			err = PTR_ERR(devlink_port);
+			goto unlock;
 		}
 		info->user_ptr[0] = devlink_port;
 	}
-	if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) {
-		mutex_lock(&devlink_port_mutex);
-	}
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
 		struct devlink_sb *devlink_sb;
 
 		devlink_sb = devlink_sb_get_from_info(devlink, info);
 		if (IS_ERR(devlink_sb)) {
-			if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
-				mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_sb);
+			err = PTR_ERR(devlink_sb);
+			goto unlock;
 		}
 		info->user_ptr[1] = devlink_sb;
 	}
 	return 0;
+
+unlock:
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+	return err;
 }
 
 static void devlink_nl_post_doit(const struct genl_ops *ops,
 				 struct sk_buff *skb, struct genl_info *info)
 {
-	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT ||
-	    ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS)
-		mutex_unlock(&devlink_port_mutex);
+	struct devlink *devlink;
+
+	devlink = devlink_get_from_info(info);
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_unlock(&devlink->lock);
 	mutex_unlock(&devlink_mutex);
 }
 
@@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_port, &devlink->port_list, list) {
 			if (idx < start) {
 				idx++;
@@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq,
 						   NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			if (idx < start) {
 				idx++;
@@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 						 NETLINK_CB(cb->skb).portid,
 						 cb->nlh->nlmsg_seq,
 						 NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
 						   devlink_sb,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_port_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_port_pool_get_dumpit(msg, start, &idx,
 							devlink, devlink_sb,
 							NETLINK_CB(cb->skb).portid,
 							cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
 			continue;
+
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
 							   devlink,
 							   devlink_sb,
 							   NETLINK_CB(cb->skb).portid,
 							   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -2322,14 +2337,16 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.doit = devlink_nl_cmd_port_split_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
 		.doit = devlink_nl_cmd_port_unsplit_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_GET,
@@ -2397,8 +2414,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
@@ -2406,8 +2422,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_ESWITCH_GET,
@@ -2488,6 +2503,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	mutex_init(&devlink->lock);
 	return devlink;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -2550,16 +2566,16 @@ int devlink_port_register(struct devlink *devlink,
 			  struct devlink_port *devlink_port,
 			  unsigned int port_index)
 {
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_port_index_exists(devlink, port_index)) {
-		mutex_unlock(&devlink_port_mutex);
+		mutex_unlock(&devlink->lock);
 		return -EEXIST;
 	}
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	devlink_port->registered = true;
 	list_add_tail(&devlink_port->list, &devlink->port_list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 	return 0;
 }
@@ -2572,10 +2588,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register);
  */
 void devlink_port_unregister(struct devlink_port *devlink_port)
 {
+	struct devlink *devlink = devlink_port->devlink;
+
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	list_del(&devlink_port->list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_port_unregister);
 
@@ -2651,7 +2669,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	struct devlink_sb *devlink_sb;
 	int err = 0;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_sb_index_exists(devlink, sb_index)) {
 		err = -EEXIST;
 		goto unlock;
@@ -2670,7 +2688,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	devlink_sb->egress_tc_count = egress_tc_count;
 	list_add_tail(&devlink_sb->list, &devlink->sb_list);
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_sb_register);
@@ -2679,11 +2697,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 {
 	struct devlink_sb *devlink_sb;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
 	WARN_ON(!devlink_sb);
 	list_del(&devlink_sb->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree(devlink_sb);
 }
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
@@ -2699,9 +2717,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 int devlink_dpipe_headers_register(struct devlink *devlink,
 				   struct devlink_dpipe_headers *dpipe_headers)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = dpipe_headers;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
@@ -2715,9 +2733,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
  */
 void devlink_dpipe_headers_unregister(struct devlink *devlink)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = NULL;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
 
@@ -2783,9 +2801,9 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 	table->priv = priv;
 	table->counter_control_extern = counter_control_extern;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
@@ -2801,17 +2819,17 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 {
 	struct devlink_dpipe_table *table;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
 					 table_name);
 	if (!table)
 		goto unlock;
 	list_del_rcu(&table->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree_rcu(table, rcu);
 	return;
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
-- 
cgit 


From d9f9b9a4d05fab693fd23a9ecaa330e03ebe2c31 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:03 +0100
Subject: devlink: Add support for resource abstraction

Add support for hardware resource abstraction over devlink. Each resource
is identified via id, furthermore it contains information regarding its
size and its related sub resources. Each resource can also provide its
current occupancy.

In some cases the sizes of some resources can be changed, yet for those
changes to take place a hot driver reload may be needed. The reload
capability will be introduced in the next patch.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 374 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 374 insertions(+)

(limited to 'net/core')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2f71734c4ff6..89b3704fa450 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2288,6 +2288,233 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
+struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+		      struct devlink_resource *resource, u64 resource_id)
+{
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		struct devlink_resource *child_resource;
+
+		if (resource->id == resource_id)
+			return resource;
+
+		child_resource = devlink_resource_find(devlink, resource,
+						       resource_id);
+		if (child_resource)
+			return child_resource;
+	}
+	return NULL;
+}
+
+void devlink_resource_validate_children(struct devlink_resource *resource)
+{
+	struct devlink_resource *child_resource;
+	bool size_valid = true;
+	u64 parts_size = 0;
+
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	list_for_each_entry(child_resource, &resource->resource_list, list)
+		parts_size += child_resource->size_new;
+
+	if (parts_size > resource->size)
+		size_valid = false;
+out:
+	resource->size_valid = size_valid;
+}
+
+static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	u64 resource_id;
+	u64 size;
+	int err;
+
+	if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
+	    !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+		return -EINVAL;
+	resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource)
+		return -EINVAL;
+
+	if (!resource->resource_ops->size_validate)
+		return -EINVAL;
+
+	size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+	err = resource->resource_ops->size_validate(devlink, size,
+						    info->extack);
+	if (err)
+		return err;
+
+	resource->size_new = size;
+	devlink_resource_validate_children(resource);
+	if (resource->parent)
+		devlink_resource_validate_children(resource->parent);
+	return 0;
+}
+
+static void
+devlink_resource_size_params_put(struct devlink_resource *resource,
+				 struct sk_buff *skb)
+{
+	struct devlink_resource_size_params *size_params;
+
+	size_params = resource->size_params;
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+			  size_params->size_granularity, DEVLINK_ATTR_PAD);
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+			  size_params->size_max, DEVLINK_ATTR_PAD);
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+			  size_params->size_min, DEVLINK_ATTR_PAD);
+	nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit);
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+				struct devlink_resource *resource)
+{
+	struct devlink_resource *child_resource;
+	struct nlattr *child_resource_attr;
+	struct nlattr *resource_attr;
+
+	resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
+	if (!resource_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
+			      DEVLINK_ATTR_PAD) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
+			      DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+	if (resource->size != resource->size_new)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+				  resource->size_new, DEVLINK_ATTR_PAD);
+	if (resource->resource_ops && resource->resource_ops->occ_get)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+				  resource->resource_ops->occ_get(devlink),
+				  DEVLINK_ATTR_PAD);
+	devlink_resource_size_params_put(resource, skb);
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+		       resource->size_valid))
+		goto nla_put_failure;
+
+	child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!child_resource_attr)
+		goto nla_put_failure;
+
+	list_for_each_entry(child_resource, &resource->resource_list, list) {
+		if (devlink_resource_put(devlink, skb, child_resource))
+			goto resource_put_failure;
+	}
+
+	nla_nest_end(skb, child_resource_attr);
+out:
+	nla_nest_end(skb, resource_attr);
+	return 0;
+
+resource_put_failure:
+	nla_nest_cancel(skb, child_resource_attr);
+nla_put_failure:
+	nla_nest_cancel(skb, resource_attr);
+	return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+				 enum devlink_command cmd, int flags)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	struct nlattr *resources_attr;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	bool incomplete;
+	void *hdr;
+	int i;
+	int err;
+
+	resource = list_first_entry(&devlink->resource_list,
+				    struct devlink_resource, list);
+start_again:
+	err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, NLM_F_MULTI, cmd);
+	if (!hdr) {
+		nlmsg_free(skb);
+		return -EMSGSIZE;
+	}
+
+	if (devlink_nl_put_handle(skb, devlink))
+		goto nla_put_failure;
+
+	resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!resources_attr)
+		goto nla_put_failure;
+
+	incomplete = false;
+	i = 0;
+	list_for_each_entry_from(resource, &devlink->resource_list, list) {
+		err = devlink_resource_put(devlink, skb, resource);
+		if (err) {
+			if (!i)
+				goto err_resource_put;
+			incomplete = true;
+			break;
+		}
+		i++;
+	}
+	nla_nest_end(skb, resources_attr);
+	genlmsg_end(skb, hdr);
+	if (incomplete)
+		goto start_again;
+send_done:
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+		if (err)
+			goto err_skb_send_alloc;
+		goto send_done;
+	}
+	return genlmsg_reply(skb, info);
+
+nla_put_failure:
+	err = -EMSGSIZE;
+err_resource_put:
+err_skb_send_alloc:
+	genlmsg_cancel(skb, hdr);
+	nlmsg_free(skb);
+	return err;
+}
+
+static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	if (list_empty(&devlink->resource_list))
+		return -EOPNOTSUPP;
+
+	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2306,6 +2533,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
+	[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -2466,6 +2695,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.doit = devlink_nl_cmd_resource_set,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.doit = devlink_nl_cmd_resource_dump,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2503,6 +2746,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	INIT_LIST_HEAD(&devlink->resource_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
@@ -2833,6 +3077,136 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
+/**
+ *	devlink_resource_register - devlink resource register
+ *
+ *	@devlink: devlink
+ *	@resource_name: resource's name
+ *	@top_hierarchy: top hierarchy
+ *	@reload_required: reload is required for new configuration to
+ *			  apply
+ *	@resource_size: resource's size
+ *	@resource_id: resource's id
+ *	@parent_reosurce_id: resource's parent id
+ *	@size params: size parameters
+ *	@resource_ops: resource ops
+ */
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_size_params *size_params,
+			      const struct devlink_resource_ops *resource_ops)
+{
+	struct devlink_resource *resource;
+	struct list_head *resource_list;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (resource) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (top_hierarchy) {
+		resource_list = &devlink->resource_list;
+	} else {
+		struct devlink_resource *parent_resource;
+
+		parent_resource = devlink_resource_find(devlink, NULL,
+							parent_resource_id);
+		if (parent_resource) {
+			resource_list = &parent_resource->resource_list;
+			resource->parent = parent_resource;
+		} else {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	resource->name = resource_name;
+	resource->size = resource_size;
+	resource->size_new = resource_size;
+	resource->id = resource_id;
+	resource->resource_ops = resource_ops;
+	resource->size_valid = true;
+	resource->size_params = size_params;
+	INIT_LIST_HEAD(&resource->resource_list);
+	list_add_tail(&resource->list, resource_list);
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
+
+/**
+ *	devlink_resources_unregister - free all resources
+ *
+ *	@devlink: devlink
+ *	@resource: resource
+ */
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource)
+{
+	struct devlink_resource *tmp, *child_resource;
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	if (!resource)
+		mutex_lock(&devlink->lock);
+
+	list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
+		devlink_resources_unregister(devlink, child_resource);
+		list_del(&child_resource->list);
+		kfree(child_resource);
+	}
+
+	if (!resource)
+		mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ *	devlink_resource_size_get - get and update size
+ *
+ *	@devlink: devlink
+ *	@resource_id: the requested resource id
+ *	@p_resource_size: ptr to update
+ */
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size)
+{
+	struct devlink_resource *resource;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource) {
+		err = -EINVAL;
+		goto out;
+	}
+	*p_resource_size = resource->size_new;
+	resource->size = resource->size_new;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit 


From 2d8dc5bbf4e7603747875eb5cadcd67c1fa8b1bb Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:04 +0100
Subject: devlink: Add support for reload

Add support for performing driver hot reload.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'net/core')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89b3704fa450..4c3d85560436 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2515,6 +2515,45 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
 	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
 }
 
+static int
+devlink_resources_validate(struct devlink *devlink,
+			   struct devlink_resource *resource,
+			   struct genl_info *info)
+{
+	struct list_head *resource_list;
+	int err = 0;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		if (!resource->size_valid)
+			return -EINVAL;
+		err = devlink_resources_validate(devlink, resource, info);
+		if (err)
+			return err;
+	}
+	return err;
+}
+
+static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	int err;
+
+	if (!devlink->ops->reload)
+		return -EOPNOTSUPP;
+
+	err = devlink_resources_validate(devlink, NULL, info);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
+		return err;
+	}
+	return devlink->ops->reload(devlink);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2709,6 +2748,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RELOAD,
+		.doit = devlink_nl_cmd_reload,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit 


From 56dc7cd0a87a1ff4f49ee1e67bd88e768385d51a Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:05 +0100
Subject: devlink: Add relation between dpipe and resource

The hardware processes which are modeled via dpipe commonly use some
internal hardware resources. Such relation can improve the understanding
of hardware limitations. The number of resource's unit consumed per
table's entry are also provided for each table.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'net/core')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4c3d85560436..dd7d6dd07bfb 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1694,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
 		       table->counters_enabled))
 		goto nla_put_failure;
 
+	if (table->resource_valid) {
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+				  table->resource_id, DEVLINK_ATTR_PAD);
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+				  table->resource_units, DEVLINK_ATTR_PAD);
+	}
 	if (devlink_dpipe_matches_put(table, skb))
 		goto nla_put_failure;
 
@@ -3254,6 +3260,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(devlink_resource_size_get);
 
+/**
+ *	devlink_dpipe_table_resource_set - set the resource id
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *	@resource_id: resource id
+ *	@resource_units: number of resource's units consumed per table's entry
+ */
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id,
+				     u64 resource_units)
+{
+	struct devlink_dpipe_table *table;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+	if (!table) {
+		err = -EINVAL;
+		goto out;
+	}
+	table->resource_id = resource_id;
+	table->resource_units = resource_units;
+	table->resource_valid = true;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit 


From 96890d62523c2cddc2c053ad29de35c4d935cf11 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jan 2018 00:42:40 +0300
Subject: net: delete /proc THIS_MODULE references

/proc has been ignoring struct file_operations::owner field for 10 years.
Specifically, it started with commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba
("Fix rmmod/read/write races in /proc entries"). Notice the chunk where
inode->i_fop is initialized with proxy struct file_operations for
regular files:

	-               if (de->proc_fops)
	-                       inode->i_fop = de->proc_fops;
	+               if (de->proc_fops) {
	+                       if (S_ISREG(inode->i_mode))
	+                               inode->i_fop = &proc_reg_file_ops;
	+                       else
	+                               inode->i_fop = de->proc_fops;
	+               }

VFS stopped pinning module at this point.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c  | 1 -
 net/core/net-procfs.c | 4 ----
 net/core/pktgen.c     | 3 ---
 net/core/sock.c       | 1 -
 4 files changed, 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d1f5fe986edd..f96f9f58b894 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2862,7 +2862,6 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file)
 };
 
 static const struct file_operations neigh_stat_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open 	 = neigh_stat_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 615ccab55f38..e010bb800d7b 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -182,7 +182,6 @@ static int dev_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dev_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = dev_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -202,7 +201,6 @@ static int softnet_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations softnet_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = softnet_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -306,7 +304,6 @@ static int ptype_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations ptype_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = ptype_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -387,7 +384,6 @@ static int dev_mc_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dev_mc_seq_fops = {
-	.owner	 = THIS_MODULE,
 	.open    = dev_mc_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b9ce241cd28c..4fcfcb14e7c6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -523,7 +523,6 @@ static int pgctrl_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pktgen_fops = {
-	.owner   = THIS_MODULE,
 	.open    = pgctrl_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1804,7 +1803,6 @@ static int pktgen_if_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pktgen_if_fops = {
-	.owner   = THIS_MODULE,
 	.open    = pktgen_if_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -1942,7 +1940,6 @@ static int pktgen_thread_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations pktgen_thread_fops = {
-	.owner   = THIS_MODULE,
 	.open    = pktgen_thread_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/net/core/sock.c b/net/core/sock.c
index 72d14b221784..abf4cbff99b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3362,7 +3362,6 @@ static int proto_seq_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations proto_seq_fops = {
-	.owner		= THIS_MODULE,
 	.open		= proto_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-- 
cgit 


From 0c06bea919f3289368a023d1a62a1bc319617fa3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 16 Jan 2018 12:31:41 +0300
Subject: net: Fix possible race in peernet2id_alloc()

peernet2id_alloc() is racy without rtnl_lock() as refcount_read(&peer->count)
under net->nsid_lock does not guarantee, peer is alive:

rcu_read_lock()
peernet2id_alloc()                            ..
  spin_lock_bh(&net->nsid_lock)               ..
  refcount_read(&peer->count) (!= 0)          ..
  ..                                          put_net()
  ..                                            cleanup_net()
  ..                                              for_each_net(tmp)
  ..                                                spin_lock_bh(&tmp->nsid_lock)
  ..                                                __peernet2id(tmp, net) == -1
  ..                                                    ..
  ..                                                    ..
    __peernet2id_alloc(alloc == true)                   ..
  ..                                                    ..
rcu_read_unlock()                                       ..
..                                                synchronize_rcu()
..                                                kmem_cache_free(net)

After the above situation, net::netns_id contains id pointing to freed memory,
and any other dereferencing by the id will operate with this freed memory.

Currently, peernet2id_alloc() is used under rtnl_lock() everywhere except
ovs_vport_cmd_fill_info(), and this race can't occur. But peernet2id_alloc()
is generic interface, and better we fix it before someone really starts
use it in wrong context.

v2: Don't place refcount_read(&net->count) under net->nsid_lock
    as suggested by Eric W. Biederman <ebiederm@xmission.com>
v3: Rebase on top of net-next

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2213d45fcafd..3c77d84ad60d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -221,17 +221,26 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
  */
 int peernet2id_alloc(struct net *net, struct net *peer)
 {
-	bool alloc;
+	bool alloc = false, alive = false;
 	int id;
 
 	if (refcount_read(&net->count) == 0)
 		return NETNSA_NSID_NOT_ASSIGNED;
 	spin_lock_bh(&net->nsid_lock);
-	alloc = refcount_read(&peer->count) == 0 ? false : true;
+	/*
+	 * When peer is obtained from RCU lists, we may race with
+	 * its cleanup. Check whether it's alive, and this guarantees
+	 * we never hash a peer back to net->netns_ids, after it has
+	 * just been idr_remove()'d from there in cleanup_net().
+	 */
+	if (maybe_get_net(peer))
+		alive = alloc = true;
 	id = __peernet2id_alloc(net, peer, &alloc);
 	spin_unlock_bh(&net->nsid_lock);
 	if (alloc && id >= 0)
 		rtnl_net_notifyid(net, RTM_NEWNSID, id);
+	if (alive)
+		put_net(peer);
 	return id;
 }
 EXPORT_SYMBOL_GPL(peernet2id_alloc);
-- 
cgit 


From 42157277af17d5c05946c700eb03877d60760d3c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 16 Jan 2018 12:31:54 +0300
Subject: net: Remove spinlock from get_net_ns_by_id()

idr_find() is safe under rcu_read_lock() and
maybe_get_net() guarantees that net is alive.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3c77d84ad60d..1ccb953b3b09 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -273,11 +273,9 @@ struct net *get_net_ns_by_id(struct net *net, int id)
 		return NULL;
 
 	rcu_read_lock();
-	spin_lock_bh(&net->nsid_lock);
 	peer = idr_find(&net->netns_ids, id);
 	if (peer)
 		peer = maybe_get_net(peer);
-	spin_unlock_bh(&net->nsid_lock);
 	rcu_read_unlock();
 
 	return peer;
-- 
cgit 


From 61f3c964dfd287b05d7ac6660a4f4ddfef84786c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 17 Jan 2018 16:52:02 -0800
Subject: bpf: allow socket_filter programs to use bpf_prog_test_run

in order to improve test coverage allow socket_filter program type
to be run via bpf_prog_test_run command.
Since such programs can be loaded by non-root tighten
permissions for bpf_prog_test_run to be root only
to avoid surprises.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index db2ee8c7e1bd..30fafaaa90fa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4526,6 +4526,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
 };
 
 const struct bpf_prog_ops sk_filter_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
 };
 
 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
-- 
cgit 


From 43dd7512b51c0f3dd8170916bd3eeb2eba808ed1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 17 Jan 2018 03:27:42 +0000
Subject: devlink: Make some functions static

Fixes the following sparse warnings:

net/core/devlink.c:2297:25: warning:
 symbol 'devlink_resource_find' was not declared. Should it be static?
net/core/devlink.c:2322:6: warning:
 symbol 'devlink_resource_validate_children' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index dd7d6dd07bfb..66d36705fb9d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2294,7 +2294,7 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
-struct devlink_resource *
+static struct devlink_resource *
 devlink_resource_find(struct devlink *devlink,
 		      struct devlink_resource *resource, u64 resource_id)
 {
@@ -2319,7 +2319,8 @@ devlink_resource_find(struct devlink *devlink,
 	return NULL;
 }
 
-void devlink_resource_validate_children(struct devlink_resource *resource)
+static void
+devlink_resource_validate_children(struct devlink_resource *resource)
 {
 	struct devlink_resource *child_resource;
 	bool size_valid = true;
-- 
cgit 


From 205c380778d09291668da5267057a80385f89437 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:30 +0100
Subject: bpf: add csum_diff helper to xdp as well

Useful for porting cls_bpf programs w/o increasing program
complexity limits much at the same time, so add the helper
to XDP as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 30fafaaa90fa..e5178851536f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3456,6 +3456,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_xdp_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
 	case BPF_FUNC_xdp_adjust_meta:
-- 
cgit 


From fa9dd599b4dae841924b022768354cfde9affecb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:33 +0100
Subject: bpf: get rid of pure_initcall dependency to enable jits

Having a pure_initcall() callback just to permanently enable BPF
JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave
a small race window in future where JIT is still disabled on boot.
Since we know about the setting at compilation time anyway, just
initialize it properly there. Also consolidate all the individual
bpf_jit_enable variables into a single one and move them under one
location. Moreover, don't allow for setting unspecified garbage
values on them.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/sysctl_net_core.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a47ad6cd41c0..6d39b4c01fc6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -25,6 +25,7 @@
 
 static int zero = 0;
 static int one = 1;
+static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
@@ -325,13 +326,14 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-		.proc_handler	= proc_dointvec
-#else
 		.proc_handler	= proc_dointvec_minmax,
+# ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		.extra1		= &one,
 		.extra2		= &one,
-#endif
+# else
+		.extra1		= &zero,
+		.extra2		= &two,
+# endif
 	},
 # ifdef CONFIG_HAVE_EBPF_JIT
 	{
@@ -339,14 +341,18 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_harden,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "bpf_jit_kallsyms",
 		.data		= &bpf_jit_kallsyms,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
 	},
 # endif
 #endif
-- 
cgit 


From 2e4a30983b0f9b19b59e38bbf7427d7fdd480d98 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:34 +0100
Subject: bpf: restrict access to core bpf sysctls

Given BPF reaches far beyond just networking these days, it was
never intended to allow setting and in some cases reading those
knobs out of a user namespace root running without CAP_SYS_ADMIN,
thus tighten such access.

Also the bpf_jit_enable = 2 debugging mode should only be allowed
if kptr_restrict is not set since it otherwise can leak addresses
to the kernel log. Dump a note to the kernel log that this is for
debugging JITs only when enabled.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/sysctl_net_core.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6d39b4c01fc6..f2d0462611c3 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -251,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
 	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
+#ifdef CONFIG_BPF_JIT
+static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
+					   void __user *buffer, size_t *lenp,
+					   loff_t *ppos)
+{
+	int ret, jit_enable = *(int *)table->data;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &jit_enable;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		if (jit_enable < 2 ||
+		    (jit_enable == 2 && bpf_dump_raw_ok())) {
+			*(int *)table->data = jit_enable;
+			if (jit_enable == 2)
+				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
+		} else {
+			ret = -EPERM;
+		}
+	}
+	return ret;
+}
+
+# ifdef CONFIG_HAVE_EBPF_JIT
+static int
+proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+				    void __user *buffer, size_t *lenp,
+				    loff_t *ppos)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+# endif
+#endif
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -326,7 +366,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_enable,
 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		.extra1		= &one,
 		.extra2		= &one,
@@ -341,7 +381,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_harden,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
@@ -350,7 +390,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_kallsyms,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-- 
cgit 


From 1728a4f2ad6840746a6b1b9f01d652c5842f7e8d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:37 +0100
Subject: bpf: move event_output to const_size_or_zero for xdp/skb as well

Similar rationale as in a60dd35d2e39 ("bpf: change bpf_perf_event_output
arg5 type to ARG_CONST_SIZE_OR_ZERO"), change the type to CONST_SIZE_OR_ZERO
such that we can better deal with optimized code. No changes needed in
bpf_event_output() as it can also deal with 0 size entirely (e.g. as only
wake-up signal with empty frame in perf RB, or packet dumps w/o meta data
as another such possibility).

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index e5178851536f..9b9b70dee208 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2861,7 +2861,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -3150,7 +3150,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
-- 
cgit 


From b75703de16301b80f1eedecafdf37bb02c9e155f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 22 Jan 2018 10:31:19 +0000
Subject: devlink: fix memory leak on 'resource'

Currently, if the call to devlink_resource_find returns null then
the error exit path does not free the devlink_resource 'resource'
and a memory leak occurs. Fix this by kfree'ing resource on the
error exit path.

Detected by CoverityScan, CID#1464184 ("Resource leak")

Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 66d36705fb9d..18d385ed8237 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3182,6 +3182,7 @@ int devlink_resource_register(struct devlink *devlink,
 			resource_list = &parent_resource->resource_list;
 			resource->parent = parent_resource;
 		} else {
+			kfree(resource);
 			err = -EINVAL;
 			goto out;
 		}
-- 
cgit 


From b2d3bcfa26a7a8de41f358a6cae8b848673b3c6e Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Thu, 18 Jan 2018 09:59:13 -0800
Subject: net: core: Expose number of link up/down transitions

Expose the number of times the link has been going UP or DOWN, and
update the "carrier_changes" counter to be the sum of these two events.
While at it, also update the sysfs-class-net documentation to cover:
carrier_changes (3.15), carrier_up_count (4.16) and carrier_down_count
(4.16)

Signed-off-by: David Decotigny <decot@googlers.com>
[Florian:
* rebase
* add documentation
* merge carrier_changes with up/down counters]
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 25 ++++++++++++++++++++++++-
 net/core/rtnetlink.c | 13 +++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7bf8b85ade16..c4a28f4667b6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev,
 	struct net_device *netdev = to_net_dev(dev);
 
 	return sprintf(buf, fmt_dec,
-		       atomic_read(&netdev->carrier_changes));
+		       atomic_read(&netdev->carrier_up_count) +
+		       atomic_read(&netdev->carrier_down_count));
 }
 static DEVICE_ATTR_RO(carrier_changes);
 
+static ssize_t carrier_up_count_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+}
+static DEVICE_ATTR_RO(carrier_up_count);
+
+static ssize_t carrier_down_count_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+}
+static DEVICE_ATTR_RO(carrier_down_count);
+
 /* read-write attributes */
 
 static int change_mtu(struct net_device *dev, unsigned long new_mtu)
@@ -547,6 +568,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,
 	&dev_attr_proto_down.attr,
+	&dev_attr_carrier_up_count.attr,
+	&dev_attr_carrier_down_count.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 16d644a4f974..97874daa1336 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -990,6 +990,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
 	       + nla_total_size(1)  /* IFLA_PROTO_DOWN */
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
+	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
+	       + nla_total_size(4)  /* IFLA_CARRIER_DOWN_COUNT */
 	       + 0;
 }
 
@@ -1551,8 +1553,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
 	    nla_put_ifalias(skb, dev) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
-			atomic_read(&dev->carrier_changes)) ||
-	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
+			atomic_read(&dev->carrier_up_count) +
+			atomic_read(&dev->carrier_down_count)) ||
+	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) ||
+	    nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
+			atomic_read(&dev->carrier_up_count)) ||
+	    nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
+			atomic_read(&dev->carrier_down_count)))
 		goto nla_put_failure;
 
 	if (event != IFLA_EVENT_NONE) {
@@ -1656,6 +1663,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_EVENT]		= { .type = NLA_U32 },
 	[IFLA_GROUP]		= { .type = NLA_U32 },
 	[IFLA_IF_NETNSID]	= { .type = NLA_S32 },
+	[IFLA_CARRIER_UP_COUNT]	= { .type = NLA_U32 },
+	[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
-- 
cgit 


From 5de30d5df95cfda14dfdbd6f8ed5021ab13be79b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 22 Jan 2018 19:14:27 -0800
Subject: net: core: Fix kernel-doc for call_netdevice_notifiers_info()

Remove the @dev comment, since we do not have a net_device argument, fixes the
following kernel doc warning: /net/core/dev.c:1707: warning: Excess function
parameter 'dev' description in 'call_netdevice_notifiers_info'

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 94435cd09072..7af0ef425ca3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1694,7 +1694,6 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 /**
  *	call_netdevice_notifiers_info - call all network notifier blocks
  *	@val: value passed unmodified to notifier function
- *	@dev: net_device pointer passed unmodified to notifier function
  *	@info: notifier information data
  *
  *	Call all network notifier blocks.  Parameters and return value
-- 
cgit 


From 7a006d5988ebd99922784176d902a335b8eb5321 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 22 Jan 2018 19:14:28 -0800
Subject: net: core: Fix kernel-doc for netdev_upper_link()

Fixes the following warnings:
./net/core/dev.c:6438: warning: No description found for parameter 'extack'
./net/core/dev.c:6461: warning: No description found for parameter 'extack'

Fixes: 42ab19ee9029 ("net: Add extack to upper device linking")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7af0ef425ca3..77795f66c246 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6424,6 +6424,7 @@ rollback:
  * netdev_upper_dev_link - Add a link to the upper device
  * @dev: device
  * @upper_dev: new upper device
+ * @extack: netlink extended ack
  *
  * Adds a link to device which is upper to this one. The caller must hold
  * the RTNL lock. On a failure a negative errno code is returned.
@@ -6445,6 +6446,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  * @upper_dev: new upper device
  * @upper_priv: upper device private
  * @upper_info: upper info to be passed down via notifier
+ * @extack: netlink extended ack
  *
  * Adds a link to device which is upper to this one. In this case, only
  * one master upper device can be linked, although other non-master devices
-- 
cgit 


From b76f4189df5c54a892ae54ac23908cc54ae7134f Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Mon, 22 Jan 2018 08:07:19 -0800
Subject: net: link_watch: mark bonding link events urgent

It takes 1sec for bond link down notification to hit user-space
when all slaves of the bond go down. 1sec is too long for
protocol daemons in user-space relying on bond notification
to recover (eg: multichassis lag implementations in user-space).
Since the link event code already marks team device port link events
 as urgent, this patch moves the code to cover all lag ports and master.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/link_watch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 982861607f88..e38e641e98d5 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -92,7 +92,7 @@ static bool linkwatch_urgent_event(struct net_device *dev)
 	if (dev->ifindex != dev_get_iflink(dev))
 		return true;
 
-	if (dev->priv_flags & IFF_TEAM_PORT)
+	if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
 		return true;
 
 	return netif_carrier_ok(dev) &&	qdisc_tx_changing(dev);
-- 
cgit 


From 57a5749b0fa3639a228d7c0ac080ee1704abe8fd Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:34 +0000
Subject: pktgen: Add missing !flag parameters

o FLOW_SEQ now can be disabled with pgset "flag !FLOW_SEQ"
o FLOW_SEQ and FLOW_RND are antonyms, as it's shown by pktgen_if_show()
o IPSEC now may be disabled

Note, that IPV6 is enabled with dst6/src6 parameters, not with
a flag parameter.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4fcfcb14e7c6..20f1c873a1ed 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1284,9 +1284,12 @@ static ssize_t pktgen_if_write(struct file *file,
 		else if (strcmp(f, "!SVID_RND") == 0)
 			pkt_dev->flags &= ~F_SVID_RND;
 
-		else if (strcmp(f, "FLOW_SEQ") == 0)
+		else if (strcmp(f, "FLOW_SEQ") == 0 || strcmp(f, "!FLOW_RND") == 0)
 			pkt_dev->flags |= F_FLOW_SEQ;
 
+		else if (strcmp(f, "FLOW_RND") == 0 || strcmp(f, "!FLOW_SEQ") == 0)
+			pkt_dev->flags &= ~F_FLOW_SEQ;
+
 		else if (strcmp(f, "QUEUE_MAP_RND") == 0)
 			pkt_dev->flags |= F_QUEUE_MAP_RND;
 
@@ -1301,6 +1304,9 @@ static ssize_t pktgen_if_write(struct file *file,
 #ifdef CONFIG_XFRM
 		else if (strcmp(f, "IPSEC") == 0)
 			pkt_dev->flags |= F_IPSEC_ON;
+
+		else if (strcmp(f, "!IPSEC") == 0)
+			pkt_dev->flags &= ~F_IPSEC_ON;
 #endif
 
 		else if (strcmp(f, "!IPV6") == 0)
-- 
cgit 


From 6f107c741212ca5fbfb0724571395716420017d6 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:35 +0000
Subject: pktgen: Add behaviour flags macro to generate flags/names

PKT_FALGS macro will be used to add package behavior names definitions
to simplify the code that prints/reads pkg flags.
Sorted the array in order of printing the flags in pktgen_if_show()
Note: Renamed IPSEC_ON => IPSEC for simplicity.

No visible behavior change expected.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 57 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 23 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 20f1c873a1ed..e6d862214cc0 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -184,25 +184,36 @@
 
 #define func_enter() pr_debug("entering %s\n", __func__);
 
+#define PKT_FLAGS							\
+	pf(IPV6)		/* Interface in IPV6 Mode */		\
+	pf(IPSRC_RND)		/* IP-Src Random  */			\
+	pf(IPDST_RND)		/* IP-Dst Random  */			\
+	pf(TXSIZE_RND)		/* Transmit size is random */		\
+	pf(UDPSRC_RND)		/* UDP-Src Random */			\
+	pf(UDPDST_RND)		/* UDP-Dst Random */			\
+	pf(UDPCSUM)		/* Include UDP checksum */		\
+	pf(NO_TIMESTAMP)	/* Don't timestamp packets (default TS) */ \
+	pf(MPLS_RND)		/* Random MPLS labels */		\
+	pf(QUEUE_MAP_RND)	/* queue map Random */			\
+	pf(QUEUE_MAP_CPU)	/* queue map mirrors smp_processor_id() */ \
+	pf(FLOW_SEQ)		/* Sequential flows */			\
+	pf(IPSEC)		/* ipsec on for flows */		\
+	pf(MACSRC_RND)		/* MAC-Src Random */			\
+	pf(MACDST_RND)		/* MAC-Dst Random */			\
+	pf(VID_RND)		/* Random VLAN ID */			\
+	pf(SVID_RND)		/* Random SVLAN ID */			\
+	pf(NODE)		/* Node memory alloc*/			\
+
+#define pf(flag)		flag##_SHIFT,
+enum pkt_flags {
+	PKT_FLAGS
+};
+#undef pf
+
 /* Device flag bits */
-#define F_IPSRC_RND   (1<<0)	/* IP-Src Random  */
-#define F_IPDST_RND   (1<<1)	/* IP-Dst Random  */
-#define F_UDPSRC_RND  (1<<2)	/* UDP-Src Random */
-#define F_UDPDST_RND  (1<<3)	/* UDP-Dst Random */
-#define F_MACSRC_RND  (1<<4)	/* MAC-Src Random */
-#define F_MACDST_RND  (1<<5)	/* MAC-Dst Random */
-#define F_TXSIZE_RND  (1<<6)	/* Transmit size is random */
-#define F_IPV6        (1<<7)	/* Interface in IPV6 Mode */
-#define F_MPLS_RND    (1<<8)	/* Random MPLS labels */
-#define F_VID_RND     (1<<9)	/* Random VLAN ID */
-#define F_SVID_RND    (1<<10)	/* Random SVLAN ID */
-#define F_FLOW_SEQ    (1<<11)	/* Sequential flows */
-#define F_IPSEC_ON    (1<<12)	/* ipsec on for flows */
-#define F_QUEUE_MAP_RND (1<<13)	/* queue map Random */
-#define F_QUEUE_MAP_CPU (1<<14)	/* queue map mirrors smp_processor_id() */
-#define F_NODE          (1<<15)	/* Node memory alloc*/
-#define F_UDPCSUM       (1<<16)	/* Include UDP checksum */
-#define F_NO_TIMESTAMP  (1<<17)	/* Don't timestamp packets (default TS) */
+#define pf(flag)		static const __u32 F_##flag = (1<<flag##_SHIFT);
+PKT_FLAGS
+#undef pf
 
 /* Thread control flag bits */
 #define T_STOP        (1<<0)	/* Stop run */
@@ -672,7 +683,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 	}
 
 #ifdef CONFIG_XFRM
-	if (pkt_dev->flags & F_IPSEC_ON) {
+	if (pkt_dev->flags & F_IPSEC) {
 		seq_puts(seq,  "IPSEC  ");
 		if (pkt_dev->spi)
 			seq_printf(seq, "spi:%u", pkt_dev->spi);
@@ -1303,10 +1314,10 @@ static ssize_t pktgen_if_write(struct file *file,
 			pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
 #ifdef CONFIG_XFRM
 		else if (strcmp(f, "IPSEC") == 0)
-			pkt_dev->flags |= F_IPSEC_ON;
+			pkt_dev->flags |= F_IPSEC;
 
 		else if (strcmp(f, "!IPSEC") == 0)
-			pkt_dev->flags &= ~F_IPSEC_ON;
+			pkt_dev->flags &= ~F_IPSEC;
 #endif
 
 		else if (strcmp(f, "!IPV6") == 0)
@@ -2547,7 +2558,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
 				pkt_dev->flows[flow].cur_daddr =
 				    pkt_dev->cur_daddr;
 #ifdef CONFIG_XFRM
-				if (pkt_dev->flags & F_IPSEC_ON)
+				if (pkt_dev->flags & F_IPSEC)
 					get_ipsec_sa(pkt_dev, flow);
 #endif
 				pkt_dev->nflows++;
@@ -2652,7 +2663,7 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
 static int process_ipsec(struct pktgen_dev *pkt_dev,
 			      struct sk_buff *skb, __be16 protocol)
 {
-	if (pkt_dev->flags & F_IPSEC_ON) {
+	if (pkt_dev->flags & F_IPSEC) {
 		struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
 		int nhead = 0;
 		if (x) {
-- 
cgit 


From 99c6d3d20d6287d7dd6e65686dba9e696de91f90 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:36 +0000
Subject: pktgen: Remove brute-force printing of flags

Add macro generated pkt_flag_names array, with a little help of which
the flags can be printed by using an index.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 77 ++++++++++++++-----------------------------------------
 1 file changed, 19 insertions(+), 58 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index e6d862214cc0..17fb036d7297 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -215,6 +215,14 @@ enum pkt_flags {
 PKT_FLAGS
 #undef pf
 
+#define pf(flag)		__stringify(flag),
+static char *pkt_flag_names[] = {
+	PKT_FLAGS
+};
+#undef pf
+
+#define NR_PKT_FLAGS		ARRAY_SIZE(pkt_flag_names)
+
 /* Thread control flag bits */
 #define T_STOP        (1<<0)	/* Stop run */
 #define T_RUN         (1<<1)	/* Start run */
@@ -545,6 +553,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 {
 	const struct pktgen_dev *pkt_dev = seq->private;
 	ktime_t stopped;
+	unsigned int i;
 	u64 idle;
 
 	seq_printf(seq,
@@ -606,7 +615,6 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 		   pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
 
 	if (pkt_dev->nr_labels) {
-		unsigned int i;
 		seq_puts(seq, "     mpls: ");
 		for (i = 0; i < pkt_dev->nr_labels; i++)
 			seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
@@ -642,68 +650,21 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 
 	seq_puts(seq, "     Flags: ");
 
-	if (pkt_dev->flags & F_IPV6)
-		seq_puts(seq, "IPV6  ");
-
-	if (pkt_dev->flags & F_IPSRC_RND)
-		seq_puts(seq, "IPSRC_RND  ");
-
-	if (pkt_dev->flags & F_IPDST_RND)
-		seq_puts(seq, "IPDST_RND  ");
-
-	if (pkt_dev->flags & F_TXSIZE_RND)
-		seq_puts(seq, "TXSIZE_RND  ");
-
-	if (pkt_dev->flags & F_UDPSRC_RND)
-		seq_puts(seq, "UDPSRC_RND  ");
-
-	if (pkt_dev->flags & F_UDPDST_RND)
-		seq_puts(seq, "UDPDST_RND  ");
-
-	if (pkt_dev->flags & F_UDPCSUM)
-		seq_puts(seq, "UDPCSUM  ");
-
-	if (pkt_dev->flags & F_NO_TIMESTAMP)
-		seq_puts(seq, "NO_TIMESTAMP  ");
-
-	if (pkt_dev->flags & F_MPLS_RND)
-		seq_puts(seq,  "MPLS_RND  ");
-
-	if (pkt_dev->flags & F_QUEUE_MAP_RND)
-		seq_puts(seq,  "QUEUE_MAP_RND  ");
-
-	if (pkt_dev->flags & F_QUEUE_MAP_CPU)
-		seq_puts(seq,  "QUEUE_MAP_CPU  ");
+	for (i = 0; i < NR_PKT_FLAGS; i++) {
+		if (i == F_FLOW_SEQ)
+			if (!pkt_dev->cflows)
+				continue;
 
-	if (pkt_dev->cflows) {
-		if (pkt_dev->flags & F_FLOW_SEQ)
-			seq_puts(seq,  "FLOW_SEQ  "); /*in sequence flows*/
-		else
-			seq_puts(seq,  "FLOW_RND  ");
-	}
+		if (pkt_dev->flags & (1 << i))
+			seq_printf(seq, "%s  ", pkt_flag_names[i]);
+		else if (i == F_FLOW_SEQ)
+			seq_puts(seq, "FLOW_RND  ");
 
 #ifdef CONFIG_XFRM
-	if (pkt_dev->flags & F_IPSEC) {
-		seq_puts(seq,  "IPSEC  ");
-		if (pkt_dev->spi)
+		if (i == F_IPSEC && pkt_dev->spi)
 			seq_printf(seq, "spi:%u", pkt_dev->spi);
-	}
 #endif
-
-	if (pkt_dev->flags & F_MACSRC_RND)
-		seq_puts(seq, "MACSRC_RND  ");
-
-	if (pkt_dev->flags & F_MACDST_RND)
-		seq_puts(seq, "MACDST_RND  ");
-
-	if (pkt_dev->flags & F_VID_RND)
-		seq_puts(seq, "VID_RND  ");
-
-	if (pkt_dev->flags & F_SVID_RND)
-		seq_puts(seq, "SVID_RND  ");
-
-	if (pkt_dev->flags & F_NODE)
-		seq_puts(seq, "NODE_ALLOC  ");
+	}
 
 	seq_puts(seq, "\n");
 
-- 
cgit 


From 52e12d5daea48c92f0b0354f4cdc127a2c0a3c52 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Thu, 18 Jan 2018 18:31:37 +0000
Subject: pktgen: Clean read user supplied flag mess

Don't use error-prone-brute-force way.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 144 +++++++++++++++---------------------------------------
 1 file changed, 39 insertions(+), 105 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 17fb036d7297..b8ab5c829511 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -830,6 +830,35 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
 	return i;
 }
 
+static __u32 pktgen_read_flag(const char *f, bool *disable)
+{
+	__u32 i;
+
+	if (f[0] == '!') {
+		*disable = true;
+		f++;
+	}
+
+	for (i = 0; i < NR_PKT_FLAGS; i++) {
+		if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT)
+			continue;
+
+		/* allow only disabling ipv6 flag */
+		if (!*disable && i == IPV6_SHIFT)
+			continue;
+
+		if (strcmp(f, pkt_flag_names[i]) == 0)
+			return 1 << i;
+	}
+
+	if (strcmp(f, "FLOW_RND") == 0) {
+		*disable = !*disable;
+		return F_FLOW_SEQ;
+	}
+
+	return 0;
+}
+
 static ssize_t pktgen_if_write(struct file *file,
 			       const char __user * user_buffer, size_t count,
 			       loff_t * offset)
@@ -1187,7 +1216,10 @@ static ssize_t pktgen_if_write(struct file *file,
 		return count;
 	}
 	if (!strcmp(name, "flag")) {
+		__u32 flag;
 		char f[32];
+		bool disable = false;
+
 		memset(f, 0, 32);
 		len = strn_len(&user_buffer[i], sizeof(f) - 1);
 		if (len < 0)
@@ -1196,113 +1228,15 @@ static ssize_t pktgen_if_write(struct file *file,
 		if (copy_from_user(f, &user_buffer[i], len))
 			return -EFAULT;
 		i += len;
-		if (strcmp(f, "IPSRC_RND") == 0)
-			pkt_dev->flags |= F_IPSRC_RND;
-
-		else if (strcmp(f, "!IPSRC_RND") == 0)
-			pkt_dev->flags &= ~F_IPSRC_RND;
-
-		else if (strcmp(f, "TXSIZE_RND") == 0)
-			pkt_dev->flags |= F_TXSIZE_RND;
-
-		else if (strcmp(f, "!TXSIZE_RND") == 0)
-			pkt_dev->flags &= ~F_TXSIZE_RND;
-
-		else if (strcmp(f, "IPDST_RND") == 0)
-			pkt_dev->flags |= F_IPDST_RND;
-
-		else if (strcmp(f, "!IPDST_RND") == 0)
-			pkt_dev->flags &= ~F_IPDST_RND;
-
-		else if (strcmp(f, "UDPSRC_RND") == 0)
-			pkt_dev->flags |= F_UDPSRC_RND;
-
-		else if (strcmp(f, "!UDPSRC_RND") == 0)
-			pkt_dev->flags &= ~F_UDPSRC_RND;
-
-		else if (strcmp(f, "UDPDST_RND") == 0)
-			pkt_dev->flags |= F_UDPDST_RND;
-
-		else if (strcmp(f, "!UDPDST_RND") == 0)
-			pkt_dev->flags &= ~F_UDPDST_RND;
-
-		else if (strcmp(f, "MACSRC_RND") == 0)
-			pkt_dev->flags |= F_MACSRC_RND;
-
-		else if (strcmp(f, "!MACSRC_RND") == 0)
-			pkt_dev->flags &= ~F_MACSRC_RND;
 
-		else if (strcmp(f, "MACDST_RND") == 0)
-			pkt_dev->flags |= F_MACDST_RND;
+		flag = pktgen_read_flag(f, &disable);
 
-		else if (strcmp(f, "!MACDST_RND") == 0)
-			pkt_dev->flags &= ~F_MACDST_RND;
-
-		else if (strcmp(f, "MPLS_RND") == 0)
-			pkt_dev->flags |= F_MPLS_RND;
-
-		else if (strcmp(f, "!MPLS_RND") == 0)
-			pkt_dev->flags &= ~F_MPLS_RND;
-
-		else if (strcmp(f, "VID_RND") == 0)
-			pkt_dev->flags |= F_VID_RND;
-
-		else if (strcmp(f, "!VID_RND") == 0)
-			pkt_dev->flags &= ~F_VID_RND;
-
-		else if (strcmp(f, "SVID_RND") == 0)
-			pkt_dev->flags |= F_SVID_RND;
-
-		else if (strcmp(f, "!SVID_RND") == 0)
-			pkt_dev->flags &= ~F_SVID_RND;
-
-		else if (strcmp(f, "FLOW_SEQ") == 0 || strcmp(f, "!FLOW_RND") == 0)
-			pkt_dev->flags |= F_FLOW_SEQ;
-
-		else if (strcmp(f, "FLOW_RND") == 0 || strcmp(f, "!FLOW_SEQ") == 0)
-			pkt_dev->flags &= ~F_FLOW_SEQ;
-
-		else if (strcmp(f, "QUEUE_MAP_RND") == 0)
-			pkt_dev->flags |= F_QUEUE_MAP_RND;
-
-		else if (strcmp(f, "!QUEUE_MAP_RND") == 0)
-			pkt_dev->flags &= ~F_QUEUE_MAP_RND;
-
-		else if (strcmp(f, "QUEUE_MAP_CPU") == 0)
-			pkt_dev->flags |= F_QUEUE_MAP_CPU;
-
-		else if (strcmp(f, "!QUEUE_MAP_CPU") == 0)
-			pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
-#ifdef CONFIG_XFRM
-		else if (strcmp(f, "IPSEC") == 0)
-			pkt_dev->flags |= F_IPSEC;
-
-		else if (strcmp(f, "!IPSEC") == 0)
-			pkt_dev->flags &= ~F_IPSEC;
-#endif
-
-		else if (strcmp(f, "!IPV6") == 0)
-			pkt_dev->flags &= ~F_IPV6;
-
-		else if (strcmp(f, "NODE_ALLOC") == 0)
-			pkt_dev->flags |= F_NODE;
-
-		else if (strcmp(f, "!NODE_ALLOC") == 0)
-			pkt_dev->flags &= ~F_NODE;
-
-		else if (strcmp(f, "UDPCSUM") == 0)
-			pkt_dev->flags |= F_UDPCSUM;
-
-		else if (strcmp(f, "!UDPCSUM") == 0)
-			pkt_dev->flags &= ~F_UDPCSUM;
-
-		else if (strcmp(f, "NO_TIMESTAMP") == 0)
-			pkt_dev->flags |= F_NO_TIMESTAMP;
-
-		else if (strcmp(f, "!NO_TIMESTAMP") == 0)
-			pkt_dev->flags &= ~F_NO_TIMESTAMP;
-
-		else {
+		if (flag) {
+			if (disable)
+				pkt_dev->flags &= ~flag;
+			else
+				pkt_dev->flags |= flag;
+		} else {
 			sprintf(pg_result,
 				"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
 				f,
-- 
cgit 


From 36fd633ec98acd2028585c22128fcaa3da6d5770 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jun 2017 13:19:16 -0400
Subject: net: separate SIOCGIFCONF handling from dev_ioctl()

Only two of dev_ioctl() callers may pass SIOCGIFCONF to it.
Separating that codepath from the rest of dev_ioctl() allows both
to simplify dev_ioctl() itself (all other cases work with struct ifreq *)
*and* seriously simplify the compat side of that beast: all it takes
is passing to inet_gifconf() an extra argument - the size of individual
records (sizeof(struct ifreq) or sizeof(struct compat_ifreq)).  With
dev_ifconf() called directly from sock_do_ioctl()/compat_dev_ifconf()
that's easy to arrange.

As the result, compat side of SIOCGIFCONF doesn't need any
allocations, copy_in_user() back and forth, etc.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/core/dev_ioctl.c | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7e690d0ccd05..5cdec23dd28e 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -66,9 +66,8 @@ EXPORT_SYMBOL(register_gifconf);
  *	Thus we will need a 'compatibility mode'.
  */
 
-static int dev_ifconf(struct net *net, char __user *arg)
+int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
 {
-	struct ifconf ifc;
 	struct net_device *dev;
 	char __user *pos;
 	int len;
@@ -79,11 +78,8 @@ static int dev_ifconf(struct net *net, char __user *arg)
 	 *	Fetch the caller's info block.
 	 */
 
-	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
-		return -EFAULT;
-
-	pos = ifc.ifc_buf;
-	len = ifc.ifc_len;
+	pos = ifc->ifc_buf;
+	len = ifc->ifc_len;
 
 	/*
 	 *	Loop over the interfaces, and write an info block for each.
@@ -95,10 +91,10 @@ static int dev_ifconf(struct net *net, char __user *arg)
 			if (gifconf_list[i]) {
 				int done;
 				if (!pos)
-					done = gifconf_list[i](dev, NULL, 0);
+					done = gifconf_list[i](dev, NULL, 0, size);
 				else
 					done = gifconf_list[i](dev, pos + total,
-							       len - total);
+							       len - total, size);
 				if (done < 0)
 					return -EFAULT;
 				total += done;
@@ -109,12 +105,12 @@ static int dev_ifconf(struct net *net, char __user *arg)
 	/*
 	 *	All done.  Write the updated control block back to the caller.
 	 */
-	ifc.ifc_len = total;
+	ifc->ifc_len = total;
 
 	/*
 	 * 	Both BSD and Solaris return 0 here, so we do too.
 	 */
-	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+	return 0;
 }
 
 /*
@@ -412,17 +408,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	int ret;
 	char *colon;
 
-	/* One special case: SIOCGIFCONF takes ifconf argument
-	   and requires shared lock, because it sleeps writing
-	   to user space.
-	 */
-
-	if (cmd == SIOCGIFCONF) {
-		rtnl_lock();
-		ret = dev_ifconf(net, (char __user *) arg);
-		rtnl_unlock();
-		return ret;
-	}
 	if (cmd == SIOCGIFNAME)
 		return dev_ifname(net, (struct ifreq __user *)arg);
 
-- 
cgit 


From b1b0c245067268043e0e832432f3d537a5cae33b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Oct 2017 20:13:08 -0400
Subject: lift handling of SIOCIW... out of dev_ioctl()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/core/dev_ioctl.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 5cdec23dd28e..d262f159f9fd 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -411,24 +411,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	if (cmd == SIOCGIFNAME)
 		return dev_ifname(net, (struct ifreq __user *)arg);
 
-	/*
-	 * Take care of Wireless Extensions. Unfortunately struct iwreq
-	 * isn't a proper subset of struct ifreq (it's 8 byte shorter)
-	 * so we need to treat it specially, otherwise applications may
-	 * fault if the struct they're passing happens to land at the
-	 * end of a mapped page.
-	 */
-	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
-		struct iwreq iwr;
-
-		if (copy_from_user(&iwr, arg, sizeof(iwr)))
-			return -EFAULT;
-
-		iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0;
-
-		return wext_handle_ioctl(net, &iwr, cmd, arg);
-	}
-
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
-- 
cgit 


From 44c02a2c3dc55835e9f0d8ef73966406cd805001 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 5 Oct 2017 12:59:44 -0400
Subject: dev_ioctl(): move copyin/copyout to callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/core/dev_ioctl.c | 85 +++++++++++++++-------------------------------------
 1 file changed, 24 insertions(+), 61 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index d262f159f9fd..0ab1af04296c 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -18,26 +18,10 @@
  *	match.  --pb
  */
 
-static int dev_ifname(struct net *net, struct ifreq __user *arg)
+static int dev_ifname(struct net *net, struct ifreq *ifr)
 {
-	struct ifreq ifr;
-	int error;
-
-	/*
-	 *	Fetch the caller's info block.
-	 */
-
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
-		return -EFAULT;
-	ifr.ifr_name[IFNAMSIZ-1] = 0;
-
-	error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
-	if (error)
-		return error;
-
-	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
-		return -EFAULT;
-	return 0;
+	ifr->ifr_name[IFNAMSIZ-1] = 0;
+	return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
 }
 
 static gifconf_func_t *gifconf_list[NPROTO];
@@ -402,24 +386,24 @@ EXPORT_SYMBOL(dev_load);
  *	positive or a negative errno code on error.
  */
 
-int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
 {
-	struct ifreq ifr;
 	int ret;
 	char *colon;
 
+	if (need_copyout)
+		*need_copyout = true;
 	if (cmd == SIOCGIFNAME)
-		return dev_ifname(net, (struct ifreq __user *)arg);
-
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
-		return -EFAULT;
+		return dev_ifname(net, ifr);
 
-	ifr.ifr_name[IFNAMSIZ-1] = 0;
+	ifr->ifr_name[IFNAMSIZ-1] = 0;
 
-	colon = strchr(ifr.ifr_name, ':');
+	colon = strchr(ifr->ifr_name, ':');
 	if (colon)
 		*colon = 0;
 
+	dev_load(net, ifr->ifr_name);
+
 	/*
 	 *	See which interface the caller is talking about.
 	 */
@@ -439,31 +423,19 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCGIFMAP:
 	case SIOCGIFINDEX:
 	case SIOCGIFTXQLEN:
-		dev_load(net, ifr.ifr_name);
 		rcu_read_lock();
-		ret = dev_ifsioc_locked(net, &ifr, cmd);
+		ret = dev_ifsioc_locked(net, ifr, cmd);
 		rcu_read_unlock();
-		if (!ret) {
-			if (colon)
-				*colon = ':';
-			if (copy_to_user(arg, &ifr,
-					 sizeof(struct ifreq)))
-				ret = -EFAULT;
-		}
+		if (colon)
+			*colon = ':';
 		return ret;
 
 	case SIOCETHTOOL:
-		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
-		ret = dev_ethtool(net, &ifr);
+		ret = dev_ethtool(net, ifr);
 		rtnl_unlock();
-		if (!ret) {
-			if (colon)
-				*colon = ':';
-			if (copy_to_user(arg, &ifr,
-					 sizeof(struct ifreq)))
-				ret = -EFAULT;
-		}
+		if (colon)
+			*colon = ':';
 		return ret;
 
 	/*
@@ -477,17 +449,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCSIFNAME:
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
-		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
-		ret = dev_ifsioc(net, &ifr, cmd);
+		ret = dev_ifsioc(net, ifr, cmd);
 		rtnl_unlock();
-		if (!ret) {
-			if (colon)
-				*colon = ':';
-			if (copy_to_user(arg, &ifr,
-					 sizeof(struct ifreq)))
-				ret = -EFAULT;
-		}
+		if (colon)
+			*colon = ':';
 		return ret;
 
 	/*
@@ -528,10 +494,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		/* fall through */
 	case SIOCBONDSLAVEINFOQUERY:
 	case SIOCBONDINFOQUERY:
-		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
-		ret = dev_ifsioc(net, &ifr, cmd);
+		ret = dev_ifsioc(net, ifr, cmd);
 		rtnl_unlock();
+		if (need_copyout)
+			*need_copyout = false;
 		return ret;
 
 	case SIOCGIFMEM:
@@ -551,13 +518,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		    cmd == SIOCGHWTSTAMP ||
 		    (cmd >= SIOCDEVPRIVATE &&
 		     cmd <= SIOCDEVPRIVATE + 15)) {
-			dev_load(net, ifr.ifr_name);
 			rtnl_lock();
-			ret = dev_ifsioc(net, &ifr, cmd);
+			ret = dev_ifsioc(net, ifr, cmd);
 			rtnl_unlock();
-			if (!ret && copy_to_user(arg, &ifr,
-						 sizeof(struct ifreq)))
-				ret = -EFAULT;
 			return ret;
 		}
 		return -ENOTTY;
-- 
cgit 


From fb07a820fe3fedabffc57863e0f823c912d81bad Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 19 Jan 2018 19:14:53 +0300
Subject: net: Move net:netns_ids destruction out of rtnl_lock() and document
 locking scheme

Currently, we unhash a dying net from netns_ids lists
under rtnl_lock(). It's a leftover from the time when
net::netns_ids was introduced. There was no net::nsid_lock,
and rtnl_lock() was mostly need to order modification
of alive nets nsid idr, i.e. for:
	for_each_net(tmp) {
		...
		id = __peernet2id(tmp, net);
		idr_remove(&tmp->netns_ids, id);
		...
	}

Since we have net::nsid_lock, the modifications are
protected by this local lock, and now we may introduce
better scheme of netns_ids destruction.

Let's look at the functions peernet2id_alloc() and
get_net_ns_by_id(). Previous commits taught these
functions to work well with dying net acquired from
rtnl unlocked lists. And they are the only functions
which can hash a net to netns_ids or obtain from there.
And as easy to check, other netns_ids operating functions
works with id, not with net pointers. So, we do not
need rtnl_lock to synchronize cleanup_net() with all them.

The another property, which is used in the patch,
is that net is unhashed from net_namespace_list
in the only place and by the only process. So,
we avoid excess rcu_read_lock() or rtnl_lock(),
when we'are iterating over the list in unhash_nsid().

All the above makes possible to keep rtnl_lock() locked
only for net->list deletion, and completely avoid it
for netns_ids unhashing and destruction. As these two
doings may take long time (e.g., memory allocation
to send skb), the patch should positively act on
the scalability and signify decrease the time, which
rtnl_lock() is held in cleanup_net().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 62 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 18 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1ccb953b3b09..3cad5f51afd3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -439,13 +439,40 @@ struct net *copy_net_ns(unsigned long flags,
 	return net;
 }
 
+static void unhash_nsid(struct net *net, struct net *last)
+{
+	struct net *tmp;
+	/* This function is only called from cleanup_net() work,
+	 * and this work is the only process, that may delete
+	 * a net from net_namespace_list. So, when the below
+	 * is executing, the list may only grow. Thus, we do not
+	 * use for_each_net_rcu() or rtnl_lock().
+	 */
+	for_each_net(tmp) {
+		int id;
+
+		spin_lock_bh(&tmp->nsid_lock);
+		id = __peernet2id(tmp, net);
+		if (id >= 0)
+			idr_remove(&tmp->netns_ids, id);
+		spin_unlock_bh(&tmp->nsid_lock);
+		if (id >= 0)
+			rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+		if (tmp == last)
+			break;
+	}
+	spin_lock_bh(&net->nsid_lock);
+	idr_destroy(&net->netns_ids);
+	spin_unlock_bh(&net->nsid_lock);
+}
+
 static DEFINE_SPINLOCK(cleanup_list_lock);
 static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
 
 static void cleanup_net(struct work_struct *work)
 {
 	const struct pernet_operations *ops;
-	struct net *net, *tmp;
+	struct net *net, *tmp, *last;
 	struct list_head net_kill_list;
 	LIST_HEAD(net_exit_list);
 
@@ -458,26 +485,25 @@ static void cleanup_net(struct work_struct *work)
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+	list_for_each_entry(net, &net_kill_list, cleanup_list)
 		list_del_rcu(&net->list);
-		list_add_tail(&net->exit_list, &net_exit_list);
-		for_each_net(tmp) {
-			int id;
-
-			spin_lock_bh(&tmp->nsid_lock);
-			id = __peernet2id(tmp, net);
-			if (id >= 0)
-				idr_remove(&tmp->netns_ids, id);
-			spin_unlock_bh(&tmp->nsid_lock);
-			if (id >= 0)
-				rtnl_net_notifyid(tmp, RTM_DELNSID, id);
-		}
-		spin_lock_bh(&net->nsid_lock);
-		idr_destroy(&net->netns_ids);
-		spin_unlock_bh(&net->nsid_lock);
+	/* Cache last net. After we unlock rtnl, no one new net
+	 * added to net_namespace_list can assign nsid pointer
+	 * to a net from net_kill_list (see peernet2id_alloc()).
+	 * So, we skip them in unhash_nsid().
+	 *
+	 * Note, that unhash_nsid() does not delete nsid links
+	 * between net_kill_list's nets, as they've already
+	 * deleted from net_namespace_list. But, this would be
+	 * useless anyway, as netns_ids are destroyed there.
+	 */
+	last = list_last_entry(&net_namespace_list, struct net, list);
+	rtnl_unlock();
 
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+		unhash_nsid(net, last);
+		list_add_tail(&net->exit_list, &net_exit_list);
 	}
-	rtnl_unlock();
 
 	/*
 	 * Another CPU might be rcu-iterating the list, wait for it.
-- 
cgit 


From 2585cd62f0986a6e6d9c83363ed6dbcc66bc9f32 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:05 -0800
Subject: bpf: Only reply field should be writeable

Currently, a sock_ops BPF program can write the op field and all the
reply fields (reply and replylong). This is a bug. The op field should
not have been writeable and there is currently no way to use replylong
field for indices >= 1. This patch enforces that only the reply field
(which equals replylong[0]) is writeable.

Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops")
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 18da42a81d0c..bf9bb755e369 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3845,8 +3845,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case offsetof(struct bpf_sock_ops, op) ...
-		     offsetof(struct bpf_sock_ops, replylong[3]):
+		case offsetof(struct bpf_sock_ops, reply):
 			break;
 		default:
 			return false;
-- 
cgit 


From a33de3973488680def6a900e74ec67ba2bef226c Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:06 -0800
Subject: bpf: Make SOCK_OPS_GET_TCP size independent

Make SOCK_OPS_GET_TCP helper macro size independent (before only worked
with 4-byte fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index bf9bb755e369..62e78748d95f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4470,9 +4470,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 /* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4484,16 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
+						   FIELD_NAME), si->dst_reg,  \
+				      si->dst_reg,			      \
 				      offsetof(struct tcp_sock, FIELD_NAME)); \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP32(snd_cwnd);
+		SOCK_OPS_GET_TCP(snd_cwnd);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP32(srtt_us);
+		SOCK_OPS_GET_TCP(srtt_us);
 		break;
 	}
 	return insn - insn_buf;
-- 
cgit 


From 34d367c59233464dbd1f07445c4665099a7128ec Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:07 -0800
Subject: bpf: Make SOCK_OPS_GET_TCP struct independent

Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added 2
arguments so now it can also work with struct sock fields.
The first argument is the name of the field in the bpf_sock_ops
struct, the 2nd argument is the name of the field in the OBJ struct.

Previous: SOCK_OPS_GET_TCP(FIELD_NAME)
New:      SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)

Where OBJ is either "struct tcp_sock" or "struct sock" (without
quotation). BPF_FIELD is the name of the field in the bpf_sock_ops
struct and OBJ_FIELD is the name of the field in the OBJ struct.

Although the field names are currently the same, the kernel struct names
could change in the future and this change makes it easier to support
that.

Note that adding access to tcp_sock fields in sock_ops programs does
not preclude the tcp_sock fields from being removed as long as we are
willing to do one of the following:

  1) Return a fixed value (e.x. 0 or 0xffffffff), or
  2) Make the verifier fail if that field is accessed (i.e. program
    fails to load) so the user will know that field is no longer
    supported.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 62e78748d95f..dbb6d2f60680 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4469,11 +4469,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
-			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4485,18 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
-						   FIELD_NAME), si->dst_reg,  \
-				      si->dst_reg,			      \
-				      offsetof(struct tcp_sock, FIELD_NAME)); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
+						       OBJ_FIELD),	      \
+				      si->dst_reg, si->dst_reg,		      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP(snd_cwnd);
+		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP(srtt_us);
+		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
 	}
 	return insn - insn_buf;
-- 
cgit 


From b73042b8a28e2603ac178295ab96c876ba5a97a1 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:08 -0800
Subject: bpf: Add write access to tcp_sock and sock fields

This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
struct tcp_sock or struct sock fields. This required adding a new
field "temp" to struct bpf_sock_ops_kern for temporary storage that
is used by sock_ops_convert_ctx_access. It is used to store and recover
the contents of a register, so the register can be used to store the
address of the sk. Since we cannot overwrite the dst_reg because it
contains the pointer to ctx, nor the src_reg since it contains the value
we want to store, we need an extra register to contain the address
of the sk.

Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
GET or SET macros depending on the value of the TYPE field.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index dbb6d2f60680..c356ec02b1a5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4491,6 +4491,54 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(OBJ, OBJ_FIELD));	      \
 	} while (0)
 
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
+	do {								      \
+		int reg = BPF_REG_9;					      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \
+				      reg, si->src_reg,			      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+	} while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \
+	do {								      \
+		if (TYPE == BPF_WRITE)					      \
+			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+		else							      \
+			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+	} while (0)
+
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
 		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
-- 
cgit 


From b13d880721729384757f235166068c315326f4a1 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:10 -0800
Subject: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock

Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary
use is to determine if there should be calls to sock_ops bpf program at
various points in the TCP code. The field is initialized to zero,
disabling the calls. A sock_ops BPF program can set it, per connection and
as necessary, when the connection is established.

It also adds support for reading and writting the field within a
sock_ops BPF program. Reading is done by accessing the field directly.
However, writing is done through the helper function
bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program
is trying to set a callback that is not supported in the current kernel
(i.e. running an older kernel). The helper function returns 0 if it was
able to set all of the bits set in the argument, a positive number
containing the bits that could not be set, or -EINVAL if the socket is
not a full TCP socket.

Examples of where one could call the bpf program:

1) When RTO fires
2) When a packet is retransmitted
3) When the connection terminates
4) When a packet is sent
5) When a packet is received

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index c356ec02b1a5..6936d19ac736 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3328,6 +3328,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, argval)
+{
+	struct sock *sk = bpf_sock->sk;
+	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+
+	if (!sk_fullsock(sk))
+		return -EINVAL;
+
+#ifdef CONFIG_INET
+	if (val)
+		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+#else
+	return -EINVAL;
+#endif
+}
+
+static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
+	.func		= bpf_sock_ops_cb_flags_set,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3510,6 +3537,8 @@ static const struct bpf_func_proto *
 		return &bpf_setsockopt_proto;
 	case BPF_FUNC_getsockopt:
 		return &bpf_getsockopt_proto;
+	case BPF_FUNC_sock_ops_cb_flags_set:
+		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
 	default:
@@ -4546,6 +4575,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, srtt_us):
 		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
+		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
+				   struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit 


From 44f0e43037d3a17b043843ba67610ac7c7e37db6 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:12 -0800
Subject: bpf: Add support for reading sk_state and more

Add support for reading many more tcp_sock fields

  state,	same as sk->sk_state
  rtt_min	same as sk->rtt_min.s[0].v (current rtt_min)
  snd_ssthresh
  rcv_nxt
  snd_nxt
  snd_una
  mss_cache
  ecn_flags
  rate_delivered
  rate_interval_us
  packets_out
  retrans_out
  total_retrans
  segs_in
  data_segs_in
  segs_out
  data_segs_out
  lost_out
  sacked_out
  sk_txhash
  bytes_received (__u64)
  bytes_acked    (__u64)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 132 insertions(+), 11 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 6936d19ac736..a858ebc4ece4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3855,33 +3855,43 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static bool __is_valid_sock_ops_access(int off, int size)
+static bool sock_ops_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (off < 0 || off >= sizeof(struct bpf_sock_ops))
 		return false;
+
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
-
-	return true;
-}
 
-static bool sock_ops_is_valid_access(int off, int size,
-				     enum bpf_access_type type,
-				     struct bpf_insn_access_aux *info)
-{
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, reply):
+			if (size != size_default)
+				return false;
 			break;
 		default:
 			return false;
 		}
+	} else {
+		switch (off) {
+		case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
+					bytes_acked):
+			if (size != sizeof(__u64))
+				return false;
+			break;
+		default:
+			if (size != size_default)
+				return false;
+			break;
+		}
 	}
 
-	return __is_valid_sock_ops_access(off, size);
+	return true;
 }
 
 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -4498,6 +4508,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
+	case offsetof(struct bpf_sock_ops, state):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_state));
+		break;
+
+	case offsetof(struct bpf_sock_ops, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      FIELD_SIZEOF(struct minmax_sample, t));
+		break;
+
 /* Helper macro for adding read access to tcp_sock or sock fields. */
 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
@@ -4580,6 +4616,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
 				   struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, lost_out):
+		SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sacked_out):
+		SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sk_txhash):
+		SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit 


From 6f9bd3d731aac0d2ac21dd78a642af5df38fb5c5 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:13 -0800
Subject: bpf: Add sock_ops R/W access to tclass

Adds direct write access to sk_txhash and access to tclass for ipv6
flows through getsockopt and setsockopt. Sample usage for tclass:

  bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v))

where skops is a pointer to the ctx (struct bpf_sock_ops).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index a858ebc4ece4..fe2c7937351f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3232,6 +3232,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct ipv6_pinfo *np = inet6_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				np->tclass = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
+#endif
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
 		if (optname == TCP_CONGESTION) {
@@ -3241,7 +3264,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			strncpy(name, optval, min_t(long, optlen,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
-			ret = tcp_set_congestion_control(sk, name, false, reinit);
+			ret = tcp_set_congestion_control(sk, name, false,
+							 reinit);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3307,6 +3331,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			*((int *)optval) = (int)np->tclass;
+			break;
+		default:
+			goto err_clear;
+		}
+#endif
 	} else {
 		goto err_clear;
 	}
@@ -3871,6 +3911,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, reply):
+		case offsetof(struct bpf_sock_ops, sk_txhash):
 			if (size != size_default)
 				return false;
 			break;
@@ -4690,7 +4731,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 	case offsetof(struct bpf_sock_ops, sk_txhash):
-		SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock);
+		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+					  struct sock, type);
 		break;
 
 	case offsetof(struct bpf_sock_ops, bytes_received):
@@ -4701,6 +4743,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, bytes_acked):
 		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
 		break;
+
 	}
 	return insn - insn_buf;
 }
-- 
cgit 


From 1d621674d923790d09cab9e2c7da7da6446a6257 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 26 Jan 2018 23:33:36 +0100
Subject: bpf: xor of a/x in cbpf can be done in 32 bit alu

Very minor optimization; saves 1 byte per program in x86_64
JIT in cBPF prologue.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index fe2c7937351f..60d8c8712652 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -401,8 +401,8 @@ do_pass:
 		/* Classic BPF expects A and X to be reset first. These need
 		 * to be guaranteed to be the first two instructions.
 		 */
-		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
-		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 
 		/* All programs must keep CTX in callee saved BPF_REG_CTX.
 		 * In eBPF case it's done by the compiler, here we need to
-- 
cgit 


From f6b1b3bf0d5f681631a293cfe1ca934b81716f1e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 26 Jan 2018 23:33:39 +0100
Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception

One of the ugly leftovers from the early eBPF days is that div/mod
operations based on registers have a hard-coded src_reg == 0 test
in the interpreter as well as in JIT code generators that would
return from the BPF program with exit code 0. This was basically
adopted from cBPF interpreter for historical reasons.

There are multiple reasons why this is very suboptimal and prone
to bugs. To name one: the return code mapping for such abnormal
program exit of 0 does not always match with a suitable program
type's exit code mapping. For example, '0' in tc means action 'ok'
where the packet gets passed further up the stack, which is just
undesirable for such cases (e.g. when implementing policy) and
also does not match with other program types.

While trying to work out an exception handling scheme, I also
noticed that programs crafted like the following will currently
pass the verifier:

  0: (bf) r6 = r1
  1: (85) call pc+8
  caller:
   R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  callee:
   frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1
  10: (b4) (u32) r2 = (u32) 0
  11: (b4) (u32) r3 = (u32) 1
  12: (3c) (u32) r3 /= (u32) r2
  13: (61) r0 = *(u32 *)(r1 +76)
  14: (95) exit
  returning from callee:
   frame1: R0_w=pkt(id=0,off=0,r=0,imm=0)
           R1=ctx(id=0,off=0,imm=0) R2_w=inv0
           R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
           R10=fp0,call_1
  to caller at 2:
   R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0)
   R10=fp0,call_-1

  from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0)
                R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  2: (bf) r1 = r6
  3: (61) r1 = *(u32 *)(r1 +80)
  4: (bf) r2 = r0
  5: (07) r2 += 8
  6: (2d) if r2 > r1 goto pc+1
   R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0)
   R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0)
   R10=fp0,call_-1
  7: (71) r0 = *(u8 *)(r0 +0)
  8: (b7) r0 = 1
  9: (95) exit

  from 6 to 8: safe
  processed 16 insns (limit 131072), stack depth 0+0

Basically what happens is that in the subprog we make use of a
div/mod by 0 exception and in the 'normal' subprog's exit path
we just return skb->data back to the main prog. This has the
implication that the verifier thinks we always get a pkt pointer
in R0 while we still have the implicit 'return 0' from the div
as an alternative unconditional return path earlier. Thus, R0
then contains 0, meaning back in the parent prog we get the
address range of [0x0, skb->data_end] as read and writeable.
Similar can be crafted with other pointer register types.

Since i) BPF_ABS/IND is not allowed in programs that contain
BPF to BPF calls (and generally it's also disadvised to use in
native eBPF context), ii) unknown opcodes don't return zero
anymore, iii) we don't return an exception code in dead branches,
the only last missing case affected and to fix is the div/mod
handling.

What we would really need is some infrastructure to propagate
exceptions all the way to the original prog unwinding the
current stack and returning that code to the caller of the
BPF program. In user space such exception handling for similar
runtimes is typically implemented with setjmp(3) and longjmp(3)
as one possibility which is not available in the kernel,
though (kgdb used to implement it in kernel long time ago). I
implemented a PoC exception handling mechanism into the BPF
interpreter with porting setjmp()/longjmp() into x86_64 and
adding a new internal BPF_ABRT opcode that can use a program
specific exception code for all exception cases we have (e.g.
div/mod by 0, unknown opcodes, etc). While this seems to work
in the constrained BPF environment (meaning, here, we don't
need to deal with state e.g. from memory allocations that we
would need to undo before going into exception state), it still
has various drawbacks: i) we would need to implement the
setjmp()/longjmp() for every arch supported in the kernel and
for x86_64, arm64, sparc64 JITs currently supporting calls,
ii) it has unconditional additional cost on main program
entry to store CPU register state in initial setjmp() call,
and we would need some way to pass the jmp_buf down into
___bpf_prog_run() for main prog and all subprogs, but also
storing on stack is not really nice (other option would be
per-cpu storage for this, but it also has the drawback that
we need to disable preemption for every BPF program types).
All in all this approach would add a lot of complexity.

Another poor-man's solution would be to have some sort of
additional shared register or scratch buffer to hold state
for exceptions, and test that after every call return to
chain returns and pass R0 all the way down to BPF prog caller.
This is also problematic in various ways: i) an additional
register doesn't map well into JITs, and some other scratch
space could only be on per-cpu storage, which, again has the
side-effect that this only works when we disable preemption,
or somewhere in the input context which is not available
everywhere either, and ii) this adds significant runtime
overhead by putting conditionals after each and every call,
as well as implementation complexity.

Yet another option is to teach verifier that div/mod can
return an integer, which however is also complex to implement
as verifier would need to walk such fake 'mov r0,<code>; exit;'
sequeuence and there would still be no guarantee for having
propagation of this further down to the BPF caller as proper
exception code. For parent prog, it is also is not distinguishable
from a normal return of a constant scalar value.

The approach taken here is a completely different one with
little complexity and no additional overhead involved in
that we make use of the fact that a div/mod by 0 is undefined
behavior. Instead of bailing out, we adapt the same behavior
as on some major archs like ARMv8 [0] into eBPF as well:
X div 0 results in 0, and X mod 0 results in X. aarch64 and
aarch32 ISA do not generate any traps or otherwise aborts
of program execution for unsigned divides. I verified this
also with a test program compiled by gcc and clang, and the
behavior matches with the spec. Going forward we adapt the
eBPF verifier to emit such rewrites once div/mod by register
was seen. cBPF is not touched and will keep existing 'return 0'
semantics. Given the options, it seems the most suitable from
all of them, also since major archs have similar schemes in
place. Given this is all in the realm of undefined behavior,
we still have the option to adapt if deemed necessary and
this way we would also have the option of more flexibility
from LLVM code generation side (which is then fully visible
to verifier). Thus, this patch i) fixes the panic seen in
above program and ii) doesn't bypass the verifier observations.

  [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b]
      http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf
      1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV)
         "A division by zero results in a zero being written to
          the destination register, without any indication that
          the division by zero occurred."
      2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV)
         "For the SDIV and UDIV instructions, division by zero
          always returns a zero result."

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 60d8c8712652..08ab4c65a998 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,8 +459,15 @@ do_pass:
 				break;
 
 			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
-			    fp->code == (BPF_ALU | BPF_MOD | BPF_X))
+			    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
 				*insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
+				/* Error with exception code on div/mod by 0.
+				 * For cBPF programs, this was always return 0.
+				 */
+				*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
+				*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+				*insn++ = BPF_EXIT_INSN();
+			}
 
 			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 			break;
-- 
cgit 


From 7c4f63ba824302492985553018881455982241d6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Wed, 24 Jan 2018 15:26:32 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID in do_setlink()

RTM_{NEW,SET}LINK already allow operations on other network namespaces
by identifying the target network namespace through IFLA_NET_NS_{FD,PID}
properties. This is done by looking for the corresponding properties in
do_setlink(). Extend do_setlink() to also look for the IFLA_IF_NETNSID
property. This introduces no functional changes since all callers of
do_setlink() currently block IFLA_IF_NETNSID by reporting an error before
they reach do_setlink().

This introduces the helpers:

static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct
                                               nlattr *tb[])

static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
                                             struct net *src_net,
					     struct nlattr *tb[], int cap)

to simplify permission checks and target network namespace retrieval for
RTM_* requests that already support IFLA_NET_NS_{FD,PID} but get extended
to IFLA_IF_NETNSID. To perserve backwards compatibility the helpers look
for IFLA_NET_NS_{FD,PID} properties first before checking for
IFLA_IF_NETNSID.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 54 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 97874daa1336..f7e99c25dfe4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1902,6 +1902,49 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 }
 EXPORT_SYMBOL(rtnl_link_get_net);
 
+/* Figure out which network namespace we are talking about by
+ * examining the link attributes in the following order:
+ *
+ * 1. IFLA_NET_NS_PID
+ * 2. IFLA_NET_NS_FD
+ * 3. IFLA_IF_NETNSID
+ */
+static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
+					       struct nlattr *tb[])
+{
+	struct net *net;
+
+	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
+		return rtnl_link_get_net(src_net, tb);
+
+	if (!tb[IFLA_IF_NETNSID])
+		return get_net(src_net);
+
+	net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID]));
+	if (!net)
+		return ERR_PTR(-EINVAL);
+
+	return net;
+}
+
+static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
+					     struct net *src_net,
+					     struct nlattr *tb[], int cap)
+{
+	struct net *net;
+
+	net = rtnl_link_get_net_by_nlattr(src_net, tb);
+	if (IS_ERR(net))
+		return net;
+
+	if (!netlink_ns_capable(skb, net->user_ns, cap)) {
+		put_net(net);
+		return ERR_PTR(-EPERM);
+	}
+
+	return net;
+}
+
 static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 {
 	if (dev) {
@@ -2164,17 +2207,14 @@ static int do_setlink(const struct sk_buff *skb,
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
-	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
-		struct net *net = rtnl_link_get_net(dev_net(dev), tb);
+	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
+		struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
+							    tb, CAP_NET_ADMIN);
 		if (IS_ERR(net)) {
 			err = PTR_ERR(net);
 			goto errout;
 		}
-		if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
-			put_net(net);
-			err = -EPERM;
-			goto errout;
-		}
+
 		err = dev_change_net_namespace(dev, net, ifname);
 		put_net(net);
 		if (err)
-- 
cgit 


From c310bfcb6e1be993629c5747accf8e1c65fbb255 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Wed, 24 Jan 2018 15:26:33 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_SETLINK

- Backwards Compatibility:
  If userspace wants to determine whether RTM_SETLINK supports the
  IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
  with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply
  does not include IFLA_IF_NETNSID userspace should assume that
  IFLA_IF_NETNSID is not supported on this kernel.
  If the reply does contain an IFLA_IF_NETNSID property userspace
  can send an RTM_SETLINK with a IFLA_IF_NETNSID property. If they receive
  EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
  with RTM_SETLINK. Userpace should then fallback to other means.

  To retain backwards compatibility the kernel will first check whether a
  IFLA_NET_NS_PID or IFLA_NET_NS_FD property has been passed. If either
  one is found it will be used to identify the target network namespace.
  This implies that users who do not care whether their running kernel
  supports IFLA_IF_NETNSID with RTM_SETLINK can pass both
  IFLA_NET_NS_{FD,PID} and IFLA_IF_NETNSID referring to the same network
  namespace.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f7e99c25dfe4..d0c02943c05a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2555,9 +2555,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
-	if (tb[IFLA_IF_NETNSID])
-		return -EOPNOTSUPP;
-
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
-- 
cgit 


From b61ad68a9fe85d29d5363eb36860164a049723cf Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Wed, 24 Jan 2018 15:26:34 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_DELLINK

- Backwards Compatibility:
  If userspace wants to determine whether RTM_DELLINK supports the
  IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
  with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply
  does not include IFLA_IF_NETNSID userspace should assume that
  IFLA_IF_NETNSID is not supported on this kernel.
  If the reply does contain an IFLA_IF_NETNSID property userspace
  can send an RTM_DELLINK with a IFLA_IF_NETNSID property. If they receive
  EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
  with RTM_DELLINK. Userpace should then fallback to other means.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d0c02943c05a..f111557958bb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2639,36 +2639,53 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct net_device *dev;
+	struct net *tgt_net = net;
+	struct net_device *dev = NULL;
 	struct ifinfomsg *ifm;
 	char ifname[IFNAMSIZ];
 	struct nlattr *tb[IFLA_MAX+1];
 	int err;
+	int netnsid = -1;
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
 	if (err < 0)
 		return err;
 
-	if (tb[IFLA_IF_NETNSID])
-		return -EOPNOTSUPP;
-
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 
+	if (tb[IFLA_IF_NETNSID]) {
+		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+		if (IS_ERR(tgt_net))
+			return PTR_ERR(tgt_net);
+	}
+
+	err = -EINVAL;
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifi_index > 0)
-		dev = __dev_get_by_index(net, ifm->ifi_index);
+		dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
 	else if (tb[IFLA_IFNAME])
-		dev = __dev_get_by_name(net, ifname);
+		dev = __dev_get_by_name(tgt_net, ifname);
 	else if (tb[IFLA_GROUP])
-		return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP]));
+		err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
 	else
-		return -EINVAL;
+		goto out;
 
-	if (!dev)
-		return -ENODEV;
+	if (!dev) {
+		if (tb[IFLA_IFNAME] || ifm->ifi_index > 0)
+			err = -ENODEV;
 
-	return rtnl_delete_link(dev);
+		goto out;
+	}
+
+	err = rtnl_delete_link(dev);
+
+out:
+	if (netnsid >= 0)
+		put_net(tgt_net);
+
+	return err;
 }
 
 int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
-- 
cgit 


From c36ac8e2307334c83e8bf81ed361f0e4959d995f Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 25 Jan 2018 15:01:38 +0100
Subject: dev: always advertise the new nsid when the netns iface changes

The user should be able to follow any interface that moves to another
netns.  There is no reason to hide physical interfaces.

CC: Jiri Benc <jbenc@redhat.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4670ccabe23a..59987eb6511a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8529,10 +8529,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net)
-		new_nsid = peernet2id_alloc(dev_net(dev), net);
-	else
-		new_nsid = peernet2id(dev_net(dev), net);
+	new_nsid = peernet2id_alloc(dev_net(dev), net);
 	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
 
 	/*
-- 
cgit 


From 38e01b30563a5b5ade7b54e5d739d16a2b02fe82 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 25 Jan 2018 15:01:39 +0100
Subject: dev: advertise the new ifindex when the netns iface changes

The goal is to let the user follow an interface that moves to another
netns.

CC: Jiri Benc <jbenc@redhat.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 19 ++++++++++++-------
 net/core/rtnetlink.c | 31 ++++++++++++++++++++-----------
 2 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 59987eb6511a..858501b12869 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7360,7 +7360,7 @@ static void rollback_registered_many(struct list_head *head)
 		if (!dev->rtnl_link_ops ||
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
-						     GFP_KERNEL, NULL);
+						     GFP_KERNEL, NULL, 0);
 
 		/*
 		 *	Flush the unicast and multicast chains
@@ -8473,7 +8473,7 @@ EXPORT_SYMBOL(unregister_netdev);
 
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
-	int err, new_nsid;
+	int err, new_nsid, new_ifindex;
 
 	ASSERT_RTNL();
 
@@ -8529,8 +8529,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+
 	new_nsid = peernet2id_alloc(dev_net(dev), net);
-	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
+	/* If there is an ifindex conflict assign a new one */
+	if (__dev_get_by_index(net, dev->ifindex))
+		new_ifindex = dev_new_index(net);
+	else
+		new_ifindex = dev->ifindex;
+
+	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
+			    new_ifindex);
 
 	/*
 	 *	Flush the unicast and multicast chains
@@ -8544,10 +8552,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
-
-	/* If there is an ifindex conflict assign a new one */
-	if (__dev_get_by_index(net, dev->ifindex))
-		dev->ifindex = dev_new_index(net);
+	dev->ifindex = new_ifindex;
 
 	/* Send a netdev-add uevent to the new namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f111557958bb..e04af7b7f448 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -988,6 +988,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_xdp_size() /* IFLA_XDP */
 	       + nla_total_size(4)  /* IFLA_EVENT */
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
+	       + nla_total_size(4)  /* IFLA_NEW_IFINDEX */
 	       + nla_total_size(1)  /* IFLA_PROTO_DOWN */
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
 	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
@@ -1511,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    struct net_device *dev, struct net *src_net,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
-			    u32 event, int *new_nsid, int tgt_netnsid)
+			    u32 event, int *new_nsid, int new_ifindex,
+			    int tgt_netnsid)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1608,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	if (new_nsid &&
 	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
 		goto nla_put_failure;
+	if (new_ifindex &&
+	    nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
+		goto nla_put_failure;
+
 
 	rcu_read_lock();
 	if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
@@ -1853,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask, 0, NULL,
+					       ext_filter_mask, 0, NULL, 0,
 					       netnsid);
 
 			if (err < 0) {
@@ -3088,7 +3094,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = rtnl_fill_ifinfo(nskb, dev, net,
 			       RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
-			       0, NULL, netnsid);
+			       0, NULL, 0, netnsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -3184,7 +3190,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned int change,
-				       u32 event, gfp_t flags, int *new_nsid)
+				       u32 event, gfp_t flags, int *new_nsid,
+				       int new_ifindex)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -3197,7 +3204,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 
 	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
 			       type, 0, 0, change, 0, 0, event,
-			       new_nsid, -1);
+			       new_nsid, new_ifindex, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -3220,14 +3227,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 
 static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 			       unsigned int change, u32 event,
-			       gfp_t flags, int *new_nsid)
+			       gfp_t flags, int *new_nsid, int new_ifindex)
 {
 	struct sk_buff *skb;
 
 	if (dev->reg_state != NETREG_REGISTERED)
 		return;
 
-	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid);
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
+				     new_ifindex);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);
 }
@@ -3235,14 +3243,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 		  gfp_t flags)
 {
-	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL);
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+			   NULL, 0);
 }
 
 void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
-			 gfp_t flags, int *new_nsid)
+			 gfp_t flags, int *new_nsid, int new_ifindex)
 {
 	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
-			   new_nsid);
+			   new_nsid, new_ifindex);
 }
 
 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -4642,7 +4651,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_CHANGELOWERSTATE:
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
-				   GFP_KERNEL, NULL);
+				   GFP_KERNEL, NULL, 0);
 		break;
 	default:
 		break;
-- 
cgit 


From 6a643ddb5624be7e0694d49f5765a8d41c1ab6d0 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 25 Jan 2018 18:26:22 -0800
Subject: net: introduce helper dev_change_tx_queue_len()

This patch promotes the local change_tx_queue_len() to a core
helper function, dev_change_tx_queue_len(), so that rtnetlink
and net-sysfs could share the code. This also prepares for the
following patch.

Note, the -EFAULT in the original code doesn't make sense,
we should propagate the errno from notifiers.

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 28 ++++++++++++++++++++++++++++
 net/core/net-sysfs.c | 25 +------------------------
 net/core/rtnetlink.c | 18 +++++-------------
 3 files changed, 34 insertions(+), 37 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 858501b12869..520c24671bc5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7047,6 +7047,34 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL(dev_set_mtu);
 
+/**
+ *	dev_change_tx_queue_len - Change TX queue length of a netdevice
+ *	@dev: device
+ *	@new_len: new tx queue length
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+	unsigned int orig_len = dev->tx_queue_len;
+	int res;
+
+	if (new_len != (unsigned int)new_len)
+		return -ERANGE;
+
+	if (new_len != orig_len) {
+		dev->tx_queue_len = new_len;
+		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+		res = notifier_to_errno(res);
+		if (res) {
+			netdev_err(dev,
+				   "refused to change device tx_queue_len\n");
+			dev->tx_queue_len = orig_len;
+			return res;
+		}
+	}
+
+	return 0;
+}
+
 /**
  *	dev_set_group - Change group this device belongs to
  *	@dev: device
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c4a28f4667b6..60a5ad2c33ee 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -346,29 +346,6 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
 }
 NETDEVICE_SHOW_RW(flags, fmt_hex);
 
-static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
-{
-	unsigned int orig_len = dev->tx_queue_len;
-	int res;
-
-	if (new_len != (unsigned int)new_len)
-		return -ERANGE;
-
-	if (new_len != orig_len) {
-		dev->tx_queue_len = new_len;
-		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
-		res = notifier_to_errno(res);
-		if (res) {
-			netdev_err(dev,
-				   "refused to change device tx_queue_len\n");
-			dev->tx_queue_len = orig_len;
-			return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
 static ssize_t tx_queue_len_store(struct device *dev,
 				  struct device_attribute *attr,
 				  const char *buf, size_t len)
@@ -376,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev,
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
-	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
+	return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len);
 }
 NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e04af7b7f448..061e27cb6c12 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2337,19 +2337,11 @@ static int do_setlink(const struct sk_buff *skb,
 
 	if (tb[IFLA_TXQLEN]) {
 		unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
-		unsigned int orig_len = dev->tx_queue_len;
-
-		if (dev->tx_queue_len ^ value) {
-			dev->tx_queue_len = value;
-			err = call_netdevice_notifiers(
-			      NETDEV_CHANGE_TX_QUEUE_LEN, dev);
-			err = notifier_to_errno(err);
-			if (err) {
-				dev->tx_queue_len = orig_len;
-				goto errout;
-			}
-			status |= DO_SETLINK_MODIFIED;
-		}
+
+		err = dev_change_tx_queue_len(dev, value);
+		if (err)
+			goto errout;
+		status |= DO_SETLINK_MODIFIED;
 	}
 
 	if (tb[IFLA_GSO_MAX_SIZE]) {
-- 
cgit 


From 48bfd55e7e4149a304e89c1999436cf52d094a27 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 25 Jan 2018 18:26:23 -0800
Subject: net_sched: plug in qdisc ops change_tx_queue_len

Introduce a new qdisc ops ->change_tx_queue_len() so that
each qdisc could decide how to implement this if it wants.
Previously we simply read dev->tx_queue_len, after pfifo_fast
switches to skb array, we need this API to resize the skb array
when we change dev->tx_queue_len.

To avoid handling race conditions with TX BH, we need to
deactivate all TX queues before change the value and bring them
back after we are done, this also makes implementation easier.

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 520c24671bc5..dda9d7b9a840 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7070,6 +7070,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 			dev->tx_queue_len = orig_len;
 			return res;
 		}
+		return dev_qdisc_change_tx_queue_len(dev);
 	}
 
 	return 0;
-- 
cgit 


From 40ca54e3a686f13117f3de0c443f8026dadf7c44 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 27 Jan 2018 10:58:43 -0800
Subject: net_sched: gen_estimator: fix lockdep splat

syzbot reported a lockdep splat in gen_new_estimator() /
est_fetch_counters() when attempting to lock est->stats_lock.

Since est_fetch_counters() is called from BH context from timer
interrupt, we need to block BH as well when calling it from process
context.

Most qdiscs use per cpu counters and are immune to the problem,
but net/sched/act_api.c and net/netfilter/xt_RATEEST.c are using
a spinlock to protect their data. They both call gen_new_estimator()
while object is created and not yet alive, so this bug could
not trigger a deadlock, only a lockdep splat.

Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_estimator.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9834cfa21b21..0a3f88f08727 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -159,7 +159,11 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	est->intvl_log = intvl_log;
 	est->cpu_bstats = cpu_bstats;
 
+	if (stats_lock)
+		local_bh_disable();
 	est_fetch_counters(est, &b);
+	if (stats_lock)
+		local_bh_enable();
 	est->last_bytes = b.bytes;
 	est->last_packets = b.packets;
 	old = rcu_dereference_protected(*rate_est, 1);
-- 
cgit 


From 5bb8ed075428b71492734af66230aa0c07fcc515 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 29 Jan 2018 18:07:20 +0100
Subject: rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK

- Backwards Compatibility:
  If userspace wants to determine whether RTM_NEWLINK supports the
  IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
  with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply
  does not include IFLA_IF_NETNSID userspace should assume that
  IFLA_IF_NETNSID is not supported on this kernel.
  If the reply does contain an IFLA_IF_NETNSID property userspace
  can send an RTM_NEWLINK with a IFLA_IF_NETNSID property. If they receive
  EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
  with RTM_NEWLINK. Userpace should then fallback to other means.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 061e27cb6c12..204297dffd2a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2952,14 +2952,10 @@ replay:
 			name_assign_type = NET_NAME_ENUM;
 		}
 
-		dest_net = rtnl_link_get_net(net, tb);
+		dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
 		if (IS_ERR(dest_net))
 			return PTR_ERR(dest_net);
 
-		err = -EPERM;
-		if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN))
-			goto out;
-
 		if (tb[IFLA_LINK_NETNSID]) {
 			int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
 
-- 
cgit