From 61792b677415b77c8db04991c22966bb8de7603e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 23 Oct 2018 16:47:16 +0200
Subject: netfilter: ipv6: fix oops when defragmenting locally generated
 fragments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unlike ipv4 and normal ipv6 defrag, netfilter ipv6 defragmentation did
not save/restore skb->dst.

This causes oops when handling locally generated ipv6 fragments, as
output path needs a valid dst.

Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Fixes: 84379c9afe01 ("netfilter: ipv6: nf_defrag: drop skb dst before queueing")
Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b8ac369f98ad..d219979c3e52 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -587,11 +587,16 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	 */
 	ret = -EINPROGRESS;
 	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
-	    fq->q.meat == fq->q.len &&
-	    nf_ct_frag6_reasm(fq, skb, dev))
-		ret = 0;
-	else
+	    fq->q.meat == fq->q.len) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		skb->_skb_refdst = 0UL;
+		if (nf_ct_frag6_reasm(fq, skb, dev))
+			ret = 0;
+		skb->_skb_refdst = orefdst;
+	} else {
 		skb_dst_drop(skb);
+	}
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-- 
cgit 


From 5e91c9d9cd3fd557226ca75fed58816b9eee7e07 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 22 Oct 2018 21:49:45 +0200
Subject: netfilter: nft_osf: check if attribute is present

If the attribute is not sent, eg. old libnftnl binary, then
tb[NFTA_OSF_TTL] is NULL and kernel crashes from the _init path.

Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_osf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index ca5e5d8c5ef8..b13618c764ec 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -50,7 +50,7 @@ static int nft_osf_init(const struct nft_ctx *ctx,
 	int err;
 	u8 ttl;
 
-	if (nla_get_u8(tb[NFTA_OSF_TTL])) {
+	if (tb[NFTA_OSF_TTL]) {
 		ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
 		if (ttl > 2)
 			return -EINVAL;
-- 
cgit 


From 4269fea768a11a447d8de620ce420f2214d4685c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 26 Oct 2018 11:14:28 +0200
Subject: Revert "netfilter: nft_numgen: add map lookups for numgen random
 operations"

Laura found a better way to do this from userspace without requiring
kernel infrastructure, revert this.

Fixes: 978d8f9055c3 ("netfilter: nft_numgen: add map lookups for numgen random operations")
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_numgen.c | 127 ---------------------------------------------
 1 file changed, 127 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 649d1700ec5b..3cc1b3dc3c3c 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,7 +24,6 @@ struct nft_ng_inc {
 	u32			modulus;
 	atomic_t		counter;
 	u32			offset;
-	struct nft_set		*map;
 };
 
 static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
@@ -48,34 +47,11 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 	regs->data[priv->dreg] = nft_ng_inc_gen(priv);
 }
 
-static void nft_ng_inc_map_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
-{
-	struct nft_ng_inc *priv = nft_expr_priv(expr);
-	const struct nft_set *map = priv->map;
-	const struct nft_set_ext *ext;
-	u32 result;
-	bool found;
-
-	result = nft_ng_inc_gen(priv);
-	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
-
-	if (!found)
-		return;
-
-	nft_data_copy(&regs->data[priv->dreg],
-		      nft_set_ext_data(ext), map->dlen);
-}
-
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_DREG]		= { .type = NLA_U32 },
 	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
 	[NFTA_NG_OFFSET]	= { .type = NLA_U32 },
-	[NFTA_NG_SET_NAME]	= { .type = NLA_STRING,
-				    .len = NFT_SET_MAXNAMELEN - 1 },
-	[NFTA_NG_SET_ID]	= { .type = NLA_U32 },
 };
 
 static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -101,22 +77,6 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
-static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
-			       const struct nft_expr *expr,
-			       const struct nlattr * const tb[])
-{
-	struct nft_ng_inc *priv = nft_expr_priv(expr);
-	u8 genmask = nft_genmask_next(ctx->net);
-
-	nft_ng_inc_init(ctx, expr, tb);
-
-	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
-					  tb[NFTA_NG_SET_NAME],
-					  tb[NFTA_NG_SET_ID], genmask);
-
-	return PTR_ERR_OR_ZERO(priv->map);
-}
-
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
 		       u32 modulus, enum nft_ng_types type, u32 offset)
 {
@@ -143,27 +103,10 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 			   priv->offset);
 }
 
-static int nft_ng_inc_map_dump(struct sk_buff *skb,
-			       const struct nft_expr *expr)
-{
-	const struct nft_ng_inc *priv = nft_expr_priv(expr);
-
-	if (nft_ng_dump(skb, priv->dreg, priv->modulus,
-			NFT_NG_INCREMENTAL, priv->offset) ||
-	    nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
-		goto nla_put_failure;
-
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	u32			offset;
-	struct nft_set		*map;
 };
 
 static u32 nft_ng_random_gen(struct nft_ng_random *priv)
@@ -183,25 +126,6 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
 	regs->data[priv->dreg] = nft_ng_random_gen(priv);
 }
 
-static void nft_ng_random_map_eval(const struct nft_expr *expr,
-				   struct nft_regs *regs,
-				   const struct nft_pktinfo *pkt)
-{
-	struct nft_ng_random *priv = nft_expr_priv(expr);
-	const struct nft_set *map = priv->map;
-	const struct nft_set_ext *ext;
-	u32 result;
-	bool found;
-
-	result = nft_ng_random_gen(priv);
-	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
-	if (!found)
-		return;
-
-	nft_data_copy(&regs->data[priv->dreg],
-		      nft_set_ext_data(ext), map->dlen);
-}
-
 static int nft_ng_random_init(const struct nft_ctx *ctx,
 			      const struct nft_expr *expr,
 			      const struct nlattr * const tb[])
@@ -226,21 +150,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
-static int nft_ng_random_map_init(const struct nft_ctx *ctx,
-				  const struct nft_expr *expr,
-				  const struct nlattr * const tb[])
-{
-	struct nft_ng_random *priv = nft_expr_priv(expr);
-	u8 genmask = nft_genmask_next(ctx->net);
-
-	nft_ng_random_init(ctx, expr, tb);
-	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
-					  tb[NFTA_NG_SET_NAME],
-					  tb[NFTA_NG_SET_ID], genmask);
-
-	return PTR_ERR_OR_ZERO(priv->map);
-}
-
 static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -249,22 +158,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 			   priv->offset);
 }
 
-static int nft_ng_random_map_dump(struct sk_buff *skb,
-				  const struct nft_expr *expr)
-{
-	const struct nft_ng_random *priv = nft_expr_priv(expr);
-
-	if (nft_ng_dump(skb, priv->dreg, priv->modulus,
-			NFT_NG_RANDOM, priv->offset) ||
-	    nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
-		goto nla_put_failure;
-
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
 static struct nft_expr_type nft_ng_type;
 static const struct nft_expr_ops nft_ng_inc_ops = {
 	.type		= &nft_ng_type,
@@ -274,14 +167,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
 	.dump		= nft_ng_inc_dump,
 };
 
-static const struct nft_expr_ops nft_ng_inc_map_ops = {
-	.type		= &nft_ng_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
-	.eval		= nft_ng_inc_map_eval,
-	.init		= nft_ng_inc_map_init,
-	.dump		= nft_ng_inc_map_dump,
-};
-
 static const struct nft_expr_ops nft_ng_random_ops = {
 	.type		= &nft_ng_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -290,14 +175,6 @@ static const struct nft_expr_ops nft_ng_random_ops = {
 	.dump		= nft_ng_random_dump,
 };
 
-static const struct nft_expr_ops nft_ng_random_map_ops = {
-	.type		= &nft_ng_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
-	.eval		= nft_ng_random_map_eval,
-	.init		= nft_ng_random_map_init,
-	.dump		= nft_ng_random_map_dump,
-};
-
 static const struct nft_expr_ops *
 nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
@@ -312,12 +189,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 
 	switch (type) {
 	case NFT_NG_INCREMENTAL:
-		if (tb[NFTA_NG_SET_NAME])
-			return &nft_ng_inc_map_ops;
 		return &nft_ng_inc_ops;
 	case NFT_NG_RANDOM:
-		if (tb[NFTA_NG_SET_NAME])
-			return &nft_ng_random_map_ops;
 		return &nft_ng_random_ops;
 	}
 
-- 
cgit 


From 439cd39ea136d2c026805264d58a91f36b6b64ca Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sat, 14 Jul 2018 21:59:43 +0200
Subject: netfilter: ipset: list:set: Decrease refcount synchronously on
 deletion and replace

Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash
when flush/dump set in parallel") postponed decreasing set
reference counters to the RCU callback.

An 'ipset del' command can terminate before the RCU grace period
is elapsed, and if sets are listed before then, the reference
counter shown in userspace will be wrong:

 # ipset create h hash:ip; ipset create l list:set; ipset add l
 # ipset del l h; ipset list h
 Name: h
 Type: hash:ip
 Revision: 4
 Header: family inet hashsize 1024 maxelem 65536
 Size in memory: 88
 References: 1
 Number of entries: 0
 Members:
 # sleep 1; ipset list h
 Name: h
 Type: hash:ip
 Revision: 4
 Header: family inet hashsize 1024 maxelem 65536
 Size in memory: 88
 References: 0
 Number of entries: 0
 Members:

Fix this by making the reference count update synchronous again.

As a result, when sets are listed, ip_set_name_byindex() might
now fetch a set whose reference count is already zero. Instead
of relying on the reference count to protect against concurrent
set renaming, grab ip_set_ref_lock as reader and copy the name,
while holding the same lock in ip_set_rename() as writer
instead.

Reported-by: Li Shuang <shuali@redhat.com>
Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c     | 23 +++++++++++------------
 net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------
 2 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index bc4bd247bb7d..fa15a831aeee 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
 /* Get the name of a set behind a set index.
- * We assume the set is referenced, so it does exist and
- * can't be destroyed. The set cannot be renamed due to
- * the referencing either.
- *
+ * Set itself is protected by RCU, but its name isn't: to protect against
+ * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the
+ * name.
  */
-const char *
-ip_set_name_byindex(struct net *net, ip_set_id_t index)
+void
+ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
 {
-	const struct ip_set *set = ip_set_rcu_get(net, index);
+	struct ip_set *set = ip_set_rcu_get(net, index);
 
 	BUG_ON(!set);
-	BUG_ON(set->ref == 0);
 
-	/* Referenced, so it's safe */
-	return set->name;
+	read_lock_bh(&ip_set_ref_lock);
+	strncpy(name, set->name, IPSET_MAXNAMELEN);
+	read_unlock_bh(&ip_set_ref_lock);
 }
 EXPORT_SYMBOL_GPL(ip_set_name_byindex);
 
@@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
 	if (!set)
 		return -ENOENT;
 
-	read_lock_bh(&ip_set_ref_lock);
+	write_lock_bh(&ip_set_ref_lock);
 	if (set->ref != 0) {
 		ret = -IPSET_ERR_REFERENCED;
 		goto out;
@@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
 	strncpy(set->name, name2, IPSET_MAXNAMELEN);
 
 out:
-	read_unlock_bh(&ip_set_ref_lock);
+	write_unlock_bh(&ip_set_ref_lock);
 	return ret;
 }
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 072a658fde04..4eef55da0878 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
 {
 	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
 	struct ip_set *set = e->set;
-	struct list_set *map = set->data;
 
-	ip_set_put_byindex(map->net, e->id);
 	ip_set_ext_destroy(set, e);
 	kfree(e);
 }
@@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu)
 static inline void
 list_set_del(struct ip_set *set, struct set_elem *e)
 {
+	struct list_set *map = set->data;
+
 	set->elements--;
 	list_del_rcu(&e->list);
+	ip_set_put_byindex(map->net, e->id);
 	call_rcu(&e->rcu, __list_set_del_rcu);
 }
 
 static inline void
-list_set_replace(struct set_elem *e, struct set_elem *old)
+list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
 {
+	struct list_set *map = set->data;
+
 	list_replace_rcu(&old->list, &e->list);
+	ip_set_put_byindex(map->net, old->id);
 	call_rcu(&old->rcu, __list_set_del_rcu);
 }
 
@@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	INIT_LIST_HEAD(&e->list);
 	list_set_init_extensions(set, ext, e);
 	if (n)
-		list_set_replace(e, n);
+		list_set_replace(set, e, n);
 	else if (next)
 		list_add_tail_rcu(&e->list, &next->list);
 	else if (prev)
@@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set,
 	const struct list_set *map = set->data;
 	struct nlattr *atd, *nested;
 	u32 i = 0, first = cb->args[IPSET_CB_ARG0];
+	char name[IPSET_MAXNAMELEN];
 	struct set_elem *e;
 	int ret = 0;
 
@@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set,
 		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 		if (!nested)
 			goto nla_put_failure;
-		if (nla_put_string(skb, IPSET_ATTR_NAME,
-				   ip_set_name_byindex(map->net, e->id)))
+		ip_set_name_byindex(map->net, e->id, name);
+		if (nla_put_string(skb, IPSET_ATTR_NAME, name))
 			goto nla_put_failure;
 		if (ip_set_put_extensions(skb, set, e, true))
 			goto nla_put_failure;
-- 
cgit 


From 886503f34d63e681662057448819edb5b1057a97 Mon Sep 17 00:00:00 2001
From: Eric Westbrook <eric@westbrook.io>
Date: Tue, 28 Aug 2018 15:14:42 -0600
Subject: netfilter: ipset: actually allow allowable CIDR 0 in
 hash:net,port,net

Allow /0 as advertised for hash:net,port,net sets.

For "hash:net,port,net", ipset(8) says that "either subnet
is permitted to be a /0 should you wish to match port
between all destinations."

Make that statement true.

Before:

    # ipset create cidrzero hash:net,port,net
    # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0
    ipset v6.34: The value of the CIDR parameter of the IP address is invalid

    # ipset create cidrzero6 hash:net,port,net family inet6
    # ipset add cidrzero6 ::/0,12345,::/0
    ipset v6.34: The value of the CIDR parameter of the IP address is invalid

After:

    # ipset create cidrzero hash:net,port,net
    # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0
    # ipset test cidrzero 192.168.205.129,12345,172.16.205.129
    192.168.205.129,tcp:12345,172.16.205.129 is in set cidrzero.

    # ipset create cidrzero6 hash:net,port,net family inet6
    # ipset add cidrzero6 ::/0,12345,::/0
    # ipset test cidrzero6 fe80::1,12345,ff00::1
    fe80::1,tcp:12345,ff00::1 is in set cidrzero6.

See also:

  https://bugzilla.kernel.org/show_bug.cgi?id=200897
  https://github.com/ewestbrook/linux/commit/df7ff6efb0934ab6acc11f003ff1a7580d6c1d9c

Signed-off-by: Eric Westbrook <linux@westbrook.io>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index d391485a6acd..613e18e720a4 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -213,13 +213,13 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CIDR]) {
 		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
-		if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+		if (e.cidr[0] > HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 	}
 
 	if (tb[IPSET_ATTR_CIDR2]) {
 		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-		if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+		if (e.cidr[1] > HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 	}
 
@@ -493,13 +493,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CIDR]) {
 		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
-		if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+		if (e.cidr[0] > HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 	}
 
 	if (tb[IPSET_ATTR_CIDR2]) {
 		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-		if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+		if (e.cidr[1] > HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 	}
 
-- 
cgit 


From ed956f3947a01ff9875cd908d7c1ef1fe7f47bf0 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Mon, 22 Oct 2018 23:30:40 +0200
Subject: netfilter: ipset: fix ip_set_list allocation failure

ip_set_create() and ip_set_net_init() attempt to allocate physically
contiguous memory for ip_set_list. If memory is fragmented, the
allocations could easily fail:

        vzctl: page allocation failure: order:7, mode:0xc0d0

        Call Trace:
         dump_stack+0x19/0x1b
         warn_alloc_failed+0x110/0x180
         __alloc_pages_nodemask+0x7bf/0xc60
         alloc_pages_current+0x98/0x110
         kmalloc_order+0x18/0x40
         kmalloc_order_trace+0x26/0xa0
         __kmalloc+0x279/0x290
         ip_set_net_init+0x4b/0x90 [ip_set]
         ops_init+0x3b/0xb0
         setup_net+0xbb/0x170
         copy_net_ns+0xf1/0x1c0
         create_new_namespaces+0xf9/0x180
         copy_namespaces+0x8e/0xd0
         copy_process+0xb61/0x1a00
         do_fork+0x91/0x320

Use kvcalloc() to fallback to 0-order allocations if high order
page isn't available.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index fa15a831aeee..68db946df151 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -960,7 +960,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
 			/* Wraparound */
 			goto cleanup;
 
-		list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
+		list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
 		if (!list)
 			goto cleanup;
 		/* nfnl mutex is held, both lists are valid */
@@ -972,7 +972,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
 		/* Use new list */
 		index = inst->ip_set_max;
 		inst->ip_set_max = i;
-		kfree(tmp);
+		kvfree(tmp);
 		ret = 0;
 	} else if (ret) {
 		goto cleanup;
@@ -2058,7 +2058,7 @@ ip_set_net_init(struct net *net)
 	if (inst->ip_set_max >= IPSET_INVALID_ID)
 		inst->ip_set_max = IPSET_INVALID_ID - 1;
 
-	list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
+	list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
 	if (!list)
 		return -ENOMEM;
 	inst->is_deleted = false;
@@ -2086,7 +2086,7 @@ ip_set_net_exit(struct net *net)
 		}
 	}
 	nfnl_unlock(NFNL_SUBSYS_IPSET);
-	kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+	kvfree(rcu_dereference_protected(inst->ip_set_list, 1));
 }
 
 static struct pernet_operations ip_set_net_ops = {
-- 
cgit 


From 7de414a9dd91426318df7b63da024b2b07e53df5 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 1 Nov 2018 12:02:37 -0700
Subject: net: drop skb on failure in ip_check_defrag()

Most callers of pskb_trim_rcsum() simply drop the skb when
it fails, however, ip_check_defrag() still continues to pass
the skb up to stack. This is suspicious.

In ip_check_defrag(), after we learn the skb is an IP fragment,
passing the skb to callers makes no sense, because callers expect
fragments are defrag'ed on success. So, dropping the skb when we
can't defrag it is reasonable.

Note, prior to commit 88078d98d1bb, this is not a big problem as
checksum will be fixed up anyway. After it, the checksum is not
correct on failure.

Found this during code review.

Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends")
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9b0158fa431f..d6ee343fdb86 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -722,10 +722,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
 	if (ip_is_fragment(&iph)) {
 		skb = skb_share_check(skb, GFP_ATOMIC);
 		if (skb) {
-			if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
-				return skb;
-			if (pskb_trim_rcsum(skb, netoff + len))
-				return skb;
+			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
+				kfree_skb(skb);
+				return NULL;
+			}
+			if (pskb_trim_rcsum(skb, netoff + len)) {
+				kfree_skb(skb);
+				return NULL;
+			}
 			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 			if (ip_defrag(net, skb, user))
 				return NULL;
-- 
cgit 


From 49682bfa1e0e448a711471a5db83be0df1fb39a2 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 31 Oct 2018 13:16:58 +0100
Subject: net: document skb parameter in function 'skb_gso_size_check'

Remove kernel-doc warning:

  net/core/skbuff.c:4953: warning: Function parameter or member 'skb' not described in 'skb_gso_size_check'

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 946de0e24c87..b4ee5c8b928f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4944,6 +4944,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
  *
  * This is a helper to do that correctly considering GSO_BY_FRAGS.
  *
+ * @skb: GSO skb
+ *
  * @seg_len: The segmented length (from skb_gso_*_seglen). In the
  *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
  *
-- 
cgit 


From 2384d02520ff2a916169b2fd85ea50e923ed56c2 Mon Sep 17 00:00:00 2001
From: Jeff Barnhill <0xeffeff@gmail.com>
Date: Fri, 2 Nov 2018 20:23:57 +0000
Subject: net/ipv6: Add anycast addresses to a global hashtable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c |  5 ++++
 net/ipv6/anycast.c  | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 81 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3f4d61017a69..f0cd291034f0 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
 	err = ip6_flowlabel_init();
 	if (err)
 		goto ip6_flowlabel_fail;
+	err = ipv6_anycast_init();
+	if (err)
+		goto ipv6_anycast_fail;
 	err = addrconf_init();
 	if (err)
 		goto addrconf_fail;
@@ -1091,6 +1094,8 @@ ipv6_frag_fail:
 ipv6_exthdrs_fail:
 	addrconf_cleanup();
 addrconf_fail:
+	ipv6_anycast_cleanup();
+ipv6_anycast_fail:
 	ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
 	ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..7698637cf827 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include <net/checksum.h>
 
+#define IN6_ADDR_HSIZE_SHIFT	8
+#define IN6_ADDR_HSIZE		BIT(IN6_ADDR_HSIZE_SHIFT)
+/*	anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+	u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+	return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  *	socket join an anycast group
  */
@@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk)
 	rtnl_unlock();
 }
 
+static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca)
+{
+	unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr);
+
+	spin_lock(&acaddr_hash_lock);
+	hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]);
+	spin_unlock(&acaddr_hash_lock);
+}
+
+static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca)
+{
+	spin_lock(&acaddr_hash_lock);
+	hlist_del_init_rcu(&aca->aca_addr_lst);
+	spin_unlock(&acaddr_hash_lock);
+}
+
 static void aca_get(struct ifacaddr6 *aca)
 {
 	refcount_inc(&aca->aca_refcnt);
 }
 
+static void aca_free_rcu(struct rcu_head *h)
+{
+	struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu);
+
+	fib6_info_release(aca->aca_rt);
+	kfree(aca);
+}
+
 static void aca_put(struct ifacaddr6 *ac)
 {
 	if (refcount_dec_and_test(&ac->aca_refcnt)) {
-		fib6_info_release(ac->aca_rt);
-		kfree(ac);
+		call_rcu(&ac->rcu, aca_free_rcu);
 	}
 }
 
@@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 	aca->aca_addr = *addr;
 	fib6_info_hold(f6i);
 	aca->aca_rt = f6i;
+	INIT_HLIST_NODE(&aca->aca_addr_lst);
 	aca->aca_users = 1;
 	/* aca_tstamp should be updated upon changes */
 	aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -285,6 +323,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 	aca_get(aca);
 	write_unlock_bh(&idev->lock);
 
+	ipv6_add_acaddr_hash(net, aca);
+
 	ip6_ins_rt(net, f6i);
 
 	addrconf_join_solict(idev->dev, &aca->aca_addr);
@@ -325,6 +365,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 	else
 		idev->ac_list = aca->aca_next;
 	write_unlock_bh(&idev->lock);
+	ipv6_del_acaddr_hash(aca);
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
 	ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
@@ -352,6 +393,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
 		idev->ac_list = aca->aca_next;
 		write_unlock_bh(&idev->lock);
 
+		ipv6_del_acaddr_hash(aca);
+
 		addrconf_leave_solict(idev, &aca->aca_addr);
 
 		ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
@@ -390,17 +433,25 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad
 bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 			 const struct in6_addr *addr)
 {
+	unsigned int hash = inet6_acaddr_hash(net, addr);
+	struct net_device *nh_dev;
+	struct ifacaddr6 *aca;
 	bool found = false;
 
 	rcu_read_lock();
 	if (dev)
 		found = ipv6_chk_acast_dev(dev, addr);
 	else
-		for_each_netdev_rcu(net, dev)
-			if (ipv6_chk_acast_dev(dev, addr)) {
+		hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
+					 aca_addr_lst) {
+			nh_dev = fib6_info_nh_dev(aca->aca_rt);
+			if (!nh_dev || !net_eq(dev_net(nh_dev), net))
+				continue;
+			if (ipv6_addr_equal(&aca->aca_addr, addr)) {
 				found = true;
 				break;
 			}
+		}
 	rcu_read_unlock();
 	return found;
 }
@@ -539,4 +590,25 @@ void ac6_proc_exit(struct net *net)
 {
 	remove_proc_entry("anycast6", net->proc_net);
 }
+
+/*	Init / cleanup code
+ */
+int __init ipv6_anycast_init(void)
+{
+	int i;
+
+	for (i = 0; i < IN6_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet6_acaddr_lst[i]);
+	return 0;
+}
+
+void ipv6_anycast_cleanup(void)
+{
+	int i;
+
+	spin_lock(&acaddr_hash_lock);
+	for (i = 0; i < IN6_ADDR_HSIZE; i++)
+		WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
+	spin_unlock(&acaddr_hash_lock);
+}
 #endif
-- 
cgit 


From c7e86acfcee30794dc99a0759924bf7b9d43f1ca Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 13:39:53 +0000
Subject: rxrpc: Fix lockup due to no error backoff after ack transmit error

If the network becomes (partially) unavailable, say by disabling IPv6, the
background ACK transmission routine can get itself into a tizzy by
proposing immediate ACK retransmission.  Since we're in the call event
processor, that happens immediately without returning to the workqueue
manager.

The condition should clear after a while when either the network comes back
or the call times out.

Fix this by:

 (1) When re-proposing an ACK on failed Tx, don't schedule it immediately.
     This will allow a certain amount of time to elapse before we try
     again.

 (2) Enforce a return to the workqueue manager after a certain number of
     iterations of the call processing loop.

 (3) Add a backoff delay that increases the delay on deferred ACKs by a
     jiffy per failed transmission to a limit of HZ.  The backoff delay is
     cleared on a successful return from kernel_sendmsg().

 (4) Cancel calls immediately if the opening sendmsg fails.  The layer
     above can arrange retransmission or rotate to another server.

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/ar-internal.h |  1 +
 net/rxrpc/call_event.c  | 18 ++++++++++++++----
 net/rxrpc/output.c      | 35 +++++++++++++++++++++++++++++++----
 3 files changed, 46 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 382196e57a26..bc628acf4f4f 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -611,6 +611,7 @@ struct rxrpc_call {
 						 * not hard-ACK'd packet follows this.
 						 */
 	rxrpc_seq_t		tx_top;		/* Highest Tx slot allocated. */
+	u16			tx_backoff;	/* Delay to insert due to Tx failure */
 
 	/* TCP-style slow-start congestion control [RFC5681].  Since the SMSS
 	 * is fixed, we keep these numbers in terms of segments (ie. DATA
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 8e7434e92097..468efc3660c0 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -123,6 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		else
 			ack_at = expiry;
 
+		ack_at += READ_ONCE(call->tx_backoff);
 		ack_at += now;
 		if (time_before(ack_at, call->ack_at)) {
 			WRITE_ONCE(call->ack_at, ack_at);
@@ -311,6 +312,7 @@ void rxrpc_process_call(struct work_struct *work)
 		container_of(work, struct rxrpc_call, processor);
 	rxrpc_serial_t *send_ack;
 	unsigned long now, next, t;
+	unsigned int iterations = 0;
 
 	rxrpc_see_call(call);
 
@@ -319,6 +321,11 @@ void rxrpc_process_call(struct work_struct *work)
 	       call->debug_id, rxrpc_call_states[call->state], call->events);
 
 recheck_state:
+	/* Limit the number of times we do this before returning to the manager */
+	iterations++;
+	if (iterations > 5)
+		goto requeue;
+
 	if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
 		rxrpc_send_abort_packet(call);
 		goto recheck_state;
@@ -447,13 +454,16 @@ recheck_state:
 	rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
 
 	/* other events may have been raised since we started checking */
-	if (call->events && call->state < RXRPC_CALL_COMPLETE) {
-		__rxrpc_queue_call(call);
-		goto out;
-	}
+	if (call->events && call->state < RXRPC_CALL_COMPLETE)
+		goto requeue;
 
 out_put:
 	rxrpc_put_call(call, rxrpc_call_put);
 out:
 	_leave("");
+	return;
+
+requeue:
+	__rxrpc_queue_call(call);
+	goto out;
 }
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 189418888839..736aa9281100 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -34,6 +34,21 @@ struct rxrpc_abort_buffer {
 
 static const char rxrpc_keepalive_string[] = "";
 
+/*
+ * Increase Tx backoff on transmission failure and clear it on success.
+ */
+static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret)
+{
+	if (ret < 0) {
+		u16 tx_backoff = READ_ONCE(call->tx_backoff);
+
+		if (tx_backoff < HZ)
+			WRITE_ONCE(call->tx_backoff, tx_backoff + 1);
+	} else {
+		WRITE_ONCE(call->tx_backoff, 0);
+	}
+}
+
 /*
  * Arrange for a keepalive ping a certain time after we last transmitted.  This
  * lets the far side know we're still interested in this call and helps keep
@@ -210,6 +225,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 	else
 		trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr,
 				      rxrpc_tx_point_call_ack);
+	rxrpc_tx_backoff(call, ret);
 
 	if (call->state < RXRPC_CALL_COMPLETE) {
 		if (ret < 0) {
@@ -218,7 +234,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 			rxrpc_propose_ACK(call, pkt->ack.reason,
 					  ntohs(pkt->ack.maxSkew),
 					  ntohl(pkt->ack.serial),
-					  true, true,
+					  false, true,
 					  rxrpc_propose_ack_retry_tx);
 		} else {
 			spin_lock_bh(&call->lock);
@@ -300,7 +316,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
 	else
 		trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
 				      rxrpc_tx_point_call_abort);
-
+	rxrpc_tx_backoff(call, ret);
 
 	rxrpc_put_connection(conn);
 	return ret;
@@ -413,6 +429,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	else
 		trace_rxrpc_tx_packet(call->debug_id, &whdr,
 				      rxrpc_tx_point_call_data_nofrag);
+	rxrpc_tx_backoff(call, ret);
 	if (ret == -EMSGSIZE)
 		goto send_fragmentable;
 
@@ -445,9 +462,18 @@ done:
 			rxrpc_reduce_call_timer(call, expect_rx_by, nowj,
 						rxrpc_timer_set_for_normal);
 		}
-	}
 
-	rxrpc_set_keepalive(call);
+		rxrpc_set_keepalive(call);
+	} else {
+		/* Cancel the call if the initial transmission fails,
+		 * particularly if that's due to network routing issues that
+		 * aren't going away anytime soon.  The layer above can arrange
+		 * the retransmission.
+		 */
+		if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+			rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
+						  RX_USER_ABORT, ret);
+	}
 
 	_leave(" = %d [%u]", ret, call->peer->maxdata);
 	return ret;
@@ -506,6 +532,7 @@ send_fragmentable:
 	else
 		trace_rxrpc_tx_packet(call->debug_id, &whdr,
 				      rxrpc_tx_point_call_data_frag);
+	rxrpc_tx_backoff(call, ret);
 
 	up_write(&conn->params.local->defrag_sem);
 	goto done;
-- 
cgit 


From 54451f60c8fa061af9051a53be9786393947367c Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sun, 21 Oct 2018 00:00:08 +0900
Subject: netfilter: xt_IDLETIMER: add sysfs filename checking routine

When IDLETIMER rule is added, sysfs file is created under
/sys/class/xt_idletimer/timers/
But some label name shouldn't be used.
".", "..", "power", "uevent", "subsystem", etc...
So that sysfs filename checking routine is needed.

test commands:
   %iptables -I INPUT -j IDLETIMER --timeout 1 --label "power"

splat looks like:
[95765.423132] sysfs: cannot create duplicate filename '/devices/virtual/xt_idletimer/timers/power'
[95765.433418] CPU: 0 PID: 8446 Comm: iptables Not tainted 4.19.0-rc6+ #20
[95765.449755] Call Trace:
[95765.449755]  dump_stack+0xc9/0x16b
[95765.449755]  ? show_regs_print_info+0x5/0x5
[95765.449755]  sysfs_warn_dup+0x74/0x90
[95765.449755]  sysfs_add_file_mode_ns+0x352/0x500
[95765.449755]  sysfs_create_file_ns+0x179/0x270
[95765.449755]  ? sysfs_add_file_mode_ns+0x500/0x500
[95765.449755]  ? idletimer_tg_checkentry+0x3e5/0xb1b [xt_IDLETIMER]
[95765.449755]  ? rcu_read_lock_sched_held+0x114/0x130
[95765.449755]  ? __kmalloc_track_caller+0x211/0x2b0
[95765.449755]  ? memcpy+0x34/0x50
[95765.449755]  idletimer_tg_checkentry+0x4e2/0xb1b [xt_IDLETIMER]
[ ... ]

Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_IDLETIMER.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index c6acfc2d9c84..eb4cbd244c3d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -114,6 +114,22 @@ static void idletimer_tg_expired(struct timer_list *t)
 	schedule_work(&timer->work);
 }
 
+static int idletimer_check_sysfs_name(const char *name, unsigned int size)
+{
+	int ret;
+
+	ret = xt_check_proc_name(name, size);
+	if (ret < 0)
+		return ret;
+
+	if (!strcmp(name, "power") ||
+	    !strcmp(name, "subsystem") ||
+	    !strcmp(name, "uevent"))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int idletimer_tg_create(struct idletimer_tg_info *info)
 {
 	int ret;
@@ -124,6 +140,10 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
 		goto out;
 	}
 
+	ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+	if (ret < 0)
+		goto out_free_timer;
+
 	sysfs_attr_init(&info->timer->attr.attr);
 	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
 	if (!info->timer->attr.attr.name) {
-- 
cgit 


From 8a02bdd50b2ecb6d62121d2958d3ea186cc88ce7 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Tue, 30 Oct 2018 22:43:42 +0100
Subject: netfilter: ipset: Fix calling ip_set() macro at dumping

The ip_set() macro is called when either ip_set_ref_lock held only
or no lock/nfnl mutex is held at dumping. Take this into account
properly. Also, use Pablo's suggestion to use rcu_dereference_raw(),
the ref_netlink protects the set.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 68db946df151..1577f2f76060 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -55,11 +55,15 @@ MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
 MODULE_DESCRIPTION("core IP set support");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
-/* When the nfnl mutex is held: */
+/* When the nfnl mutex or ip_set_ref_lock is held: */
 #define ip_set_dereference(p)		\
-	rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+	rcu_dereference_protected(p,	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
+		lockdep_is_held(&ip_set_ref_lock))
 #define ip_set(inst, id)		\
 	ip_set_dereference((inst)->ip_set_list)[id]
+#define ip_set_ref_netlink(inst,id)	\
+	rcu_dereference_raw((inst)->ip_set_list)[id]
 
 /* The set types are implemented in modules and registered set types
  * can be found in ip_set_type_list. Adding/deleting types is
@@ -1251,7 +1255,7 @@ ip_set_dump_done(struct netlink_callback *cb)
 		struct ip_set_net *inst =
 			(struct ip_set_net *)cb->args[IPSET_CB_NET];
 		ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
-		struct ip_set *set = ip_set(inst, index);
+		struct ip_set *set = ip_set_ref_netlink(inst, index);
 
 		if (set->variant->uref)
 			set->variant->uref(set, cb, false);
@@ -1440,7 +1444,7 @@ next_set:
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[IPSET_CB_ARG0]) {
-		set = ip_set(inst, index);
+		set = ip_set_ref_netlink(inst, index);
 		if (set->variant->uref)
 			set->variant->uref(set, cb, false);
 		pr_debug("release set %s\n", set->name);
-- 
cgit 


From a95a7774d51e13f9cf4b7285666829b68852f07a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 2 Nov 2018 00:11:34 +0100
Subject: netfilter: conntrack: add
 nf_{tcp,udp,sctp,icmp,dccp,icmpv6,generic}_pernet()

Expose these functions to access conntrack protocol tracker netns area,
nfnetlink_cttimeout needs this.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_dccp.c    | 13 ++++---------
 net/netfilter/nf_conntrack_proto_generic.c | 11 +++--------
 net/netfilter/nf_conntrack_proto_icmp.c    | 11 +++--------
 net/netfilter/nf_conntrack_proto_icmpv6.c  | 11 +++--------
 net/netfilter/nf_conntrack_proto_sctp.c    | 11 +++--------
 net/netfilter/nf_conntrack_proto_tcp.c     | 15 +++++----------
 net/netfilter/nf_conntrack_proto_udp.c     | 11 +++--------
 7 files changed, 24 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 171e9e122e5f..023c1445bc39 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -384,11 +384,6 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
 	},
 };
 
-static inline struct nf_dccp_net *dccp_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.dccp;
-}
-
 static noinline bool
 dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	 const struct dccp_hdr *dh)
@@ -401,7 +396,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
 	switch (state) {
 	default:
-		dn = dccp_pernet(net);
+		dn = nf_dccp_pernet(net);
 		if (dn->dccp_loose == 0) {
 			msg = "not picking up existing connection ";
 			goto out_invalid;
@@ -568,7 +563,7 @@ static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
 
 	timeouts = nf_ct_timeout_lookup(ct);
 	if (!timeouts)
-		timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout;
+		timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout;
 	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
 
 	return NF_ACCEPT;
@@ -681,7 +676,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
 static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				      struct net *net, void *data)
 {
-	struct nf_dccp_net *dn = dccp_pernet(net);
+	struct nf_dccp_net *dn = nf_dccp_pernet(net);
 	unsigned int *timeouts = data;
 	int i;
 
@@ -814,7 +809,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
 
 static int dccp_init_net(struct net *net)
 {
-	struct nf_dccp_net *dn = dccp_pernet(net);
+	struct nf_dccp_net *dn = nf_dccp_pernet(net);
 	struct nf_proto_net *pn = &dn->pn;
 
 	if (!pn->users) {
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index e10e867e0b55..5da19d5fbc76 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -27,11 +27,6 @@ static bool nf_generic_should_process(u8 proto)
 	}
 }
 
-static inline struct nf_generic_net *generic_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.generic;
-}
-
 static bool generic_pkt_to_tuple(const struct sk_buff *skb,
 				 unsigned int dataoff,
 				 struct net *net, struct nf_conntrack_tuple *tuple)
@@ -58,7 +53,7 @@ static int generic_packet(struct nf_conn *ct,
 	}
 
 	if (!timeout)
-		timeout = &generic_pernet(nf_ct_net(ct))->timeout;
+		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
 
 	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
 	return NF_ACCEPT;
@@ -72,7 +67,7 @@ static int generic_packet(struct nf_conn *ct,
 static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
 					 struct net *net, void *data)
 {
-	struct nf_generic_net *gn = generic_pernet(net);
+	struct nf_generic_net *gn = nf_generic_pernet(net);
 	unsigned int *timeout = data;
 
 	if (!timeout)
@@ -138,7 +133,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
 
 static int generic_init_net(struct net *net)
 {
-	struct nf_generic_net *gn = generic_pernet(net);
+	struct nf_generic_net *gn = nf_generic_pernet(net);
 	struct nf_proto_net *pn = &gn->pn;
 
 	gn->timeout = nf_ct_generic_timeout;
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 3598520bd19b..de64d8a5fdfd 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -25,11 +25,6 @@
 
 static const unsigned int nf_ct_icmp_timeout = 30*HZ;
 
-static inline struct nf_icmp_net *icmp_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.icmp;
-}
-
 static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 			      struct net *net, struct nf_conntrack_tuple *tuple)
 {
@@ -103,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct,
 	}
 
 	if (!timeout)
-		timeout = &icmp_pernet(nf_ct_net(ct))->timeout;
+		timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout;
 
 	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
 	return NF_ACCEPT;
@@ -275,7 +270,7 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				      struct net *net, void *data)
 {
 	unsigned int *timeout = data;
-	struct nf_icmp_net *in = icmp_pernet(net);
+	struct nf_icmp_net *in = nf_icmp_pernet(net);
 
 	if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
 		if (!timeout)
@@ -337,7 +332,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 
 static int icmp_init_net(struct net *net)
 {
-	struct nf_icmp_net *in = icmp_pernet(net);
+	struct nf_icmp_net *in = nf_icmp_pernet(net);
 	struct nf_proto_net *pn = &in->pn;
 
 	in->timeout = nf_ct_icmp_timeout;
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 378618feed5d..a15eefb8e317 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -30,11 +30,6 @@
 
 static const unsigned int nf_ct_icmpv6_timeout = 30*HZ;
 
-static inline struct nf_icmp_net *icmpv6_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.icmpv6;
-}
-
 static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
 				unsigned int dataoff,
 				struct net *net,
@@ -87,7 +82,7 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 
 static unsigned int *icmpv6_get_timeouts(struct net *net)
 {
-	return &icmpv6_pernet(net)->timeout;
+	return &nf_icmpv6_pernet(net)->timeout;
 }
 
 /* Returns verdict for packet, or -1 for invalid. */
@@ -286,7 +281,7 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
 					struct net *net, void *data)
 {
 	unsigned int *timeout = data;
-	struct nf_icmp_net *in = icmpv6_pernet(net);
+	struct nf_icmp_net *in = nf_icmpv6_pernet(net);
 
 	if (!timeout)
 		timeout = icmpv6_get_timeouts(net);
@@ -348,7 +343,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn,
 
 static int icmpv6_init_net(struct net *net)
 {
-	struct nf_icmp_net *in = icmpv6_pernet(net);
+	struct nf_icmp_net *in = nf_icmpv6_pernet(net);
 	struct nf_proto_net *pn = &in->pn;
 
 	in->timeout = nf_ct_icmpv6_timeout;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 3d719d3eb9a3..d53e3e78f605 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -146,11 +146,6 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
 	}
 };
 
-static inline struct nf_sctp_net *sctp_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.sctp;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 /* Print out the private part of the conntrack. */
 static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -480,7 +475,7 @@ static int sctp_packet(struct nf_conn *ct,
 
 	timeouts = nf_ct_timeout_lookup(ct);
 	if (!timeouts)
-		timeouts = sctp_pernet(nf_ct_net(ct))->timeouts;
+		timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;
 
 	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
 
@@ -599,7 +594,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				      struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
-	struct nf_sctp_net *sn = sctp_pernet(net);
+	struct nf_sctp_net *sn = nf_sctp_pernet(net);
 	int i;
 
 	/* set default SCTP timeouts. */
@@ -736,7 +731,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 
 static int sctp_init_net(struct net *net)
 {
-	struct nf_sctp_net *sn = sctp_pernet(net);
+	struct nf_sctp_net *sn = nf_sctp_pernet(net);
 	struct nf_proto_net *pn = &sn->pn;
 
 	if (!pn->users) {
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 1bcf9984d45e..4dcbd51a8e97 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -272,11 +272,6 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
 	}
 };
 
-static inline struct nf_tcp_net *tcp_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.tcp;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -475,7 +470,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			  const struct tcphdr *tcph)
 {
 	struct net *net = nf_ct_net(ct);
-	struct nf_tcp_net *tn = tcp_pernet(net);
+	struct nf_tcp_net *tn = nf_tcp_pernet(net);
 	struct ip_ct_tcp_state *sender = &state->seen[dir];
 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
 	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
@@ -767,7 +762,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 {
 	enum tcp_conntrack new_state;
 	struct net *net = nf_ct_net(ct);
-	const struct nf_tcp_net *tn = tcp_pernet(net);
+	const struct nf_tcp_net *tn = nf_tcp_pernet(net);
 	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
 	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
 
@@ -841,7 +836,7 @@ static int tcp_packet(struct nf_conn *ct,
 		      const struct nf_hook_state *state)
 {
 	struct net *net = nf_ct_net(ct);
-	struct nf_tcp_net *tn = tcp_pernet(net);
+	struct nf_tcp_net *tn = nf_tcp_pernet(net);
 	struct nf_conntrack_tuple *tuple;
 	enum tcp_conntrack new_state, old_state;
 	unsigned int index, *timeouts;
@@ -1283,7 +1278,7 @@ static unsigned int tcp_nlattr_tuple_size(void)
 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				     struct net *net, void *data)
 {
-	struct nf_tcp_net *tn = tcp_pernet(net);
+	struct nf_tcp_net *tn = nf_tcp_pernet(net);
 	unsigned int *timeouts = data;
 	int i;
 
@@ -1508,7 +1503,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 
 static int tcp_init_net(struct net *net)
 {
-	struct nf_tcp_net *tn = tcp_pernet(net);
+	struct nf_tcp_net *tn = nf_tcp_pernet(net);
 	struct nf_proto_net *pn = &tn->pn;
 
 	if (!pn->users) {
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index a7aa70370913..c879d8d78cfd 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -32,14 +32,9 @@ static const unsigned int udp_timeouts[UDP_CT_MAX] = {
 	[UDP_CT_REPLIED]	= 180*HZ,
 };
 
-static inline struct nf_udp_net *udp_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.udp;
-}
-
 static unsigned int *udp_get_timeouts(struct net *net)
 {
-	return udp_pernet(net)->timeouts;
+	return nf_udp_pernet(net)->timeouts;
 }
 
 static void udp_error_log(const struct sk_buff *skb,
@@ -212,7 +207,7 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				     struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
-	struct nf_udp_net *un = udp_pernet(net);
+	struct nf_udp_net *un = nf_udp_pernet(net);
 
 	if (!timeouts)
 		timeouts = un->timeouts;
@@ -292,7 +287,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 
 static int udp_init_net(struct net *net)
 {
-	struct nf_udp_net *un = udp_pernet(net);
+	struct nf_udp_net *un = nf_udp_pernet(net);
 	struct nf_proto_net *pn = &un->pn;
 
 	if (!pn->users) {
-- 
cgit 


From 8866df9264a34e675b4ee8a151db819b87cce2d3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 2 Nov 2018 00:14:00 +0100
Subject: netfilter: nfnetlink_cttimeout: pass default timeout policy to
 obj_to_nlattr

Otherwise, we hit a NULL pointer deference since handlers always assume
default timeout policy is passed.

  netlink: 24 bytes leftover after parsing attributes in process `syz-executor2'.
  kasan: CONFIG_KASAN_INLINE enabled
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] PREEMPT SMP KASAN
  CPU: 0 PID: 9575 Comm: syz-executor1 Not tainted 4.19.0+ #312
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  RIP: 0010:icmp_timeout_obj_to_nlattr+0x77/0x170 net/netfilter/nf_conntrack_proto_icmp.c:297

Fixes: c779e849608a ("netfilter: conntrack: remove get_timeout() indirection")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_cttimeout.c | 47 +++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index e7a50af1b3d6..a518eb162344 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -382,7 +382,8 @@ err:
 static int
 cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
 			    u32 seq, u32 type, int event, u16 l3num,
-			    const struct nf_conntrack_l4proto *l4proto)
+			    const struct nf_conntrack_l4proto *l4proto,
+			    const unsigned int *timeouts)
 {
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
@@ -408,7 +409,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
 	if (!nest_parms)
 		goto nla_put_failure;
 
-	ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
+	ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
 	if (ret < 0)
 		goto nla_put_failure;
 
@@ -430,6 +431,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 				 struct netlink_ext_ack *extack)
 {
 	const struct nf_conntrack_l4proto *l4proto;
+	unsigned int *timeouts = NULL;
 	struct sk_buff *skb2;
 	int ret, err;
 	__u16 l3num;
@@ -442,12 +444,44 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
 	l4proto = nf_ct_l4proto_find_get(l4num);
 
-	/* This protocol is not supported, skip. */
-	if (l4proto->l4proto != l4num) {
-		err = -EOPNOTSUPP;
+	err = -EOPNOTSUPP;
+	if (l4proto->l4proto != l4num)
 		goto err;
+
+	switch (l4proto->l4proto) {
+	case IPPROTO_ICMP:
+		timeouts = &nf_icmp_pernet(net)->timeout;
+		break;
+	case IPPROTO_TCP:
+		timeouts = nf_tcp_pernet(net)->timeouts;
+		break;
+	case IPPROTO_UDP:
+		timeouts = nf_udp_pernet(net)->timeouts;
+		break;
+	case IPPROTO_DCCP:
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+		timeouts = nf_dccp_pernet(net)->dccp_timeout;
+#endif
+		break;
+	case IPPROTO_ICMPV6:
+		timeouts = &nf_icmpv6_pernet(net)->timeout;
+		break;
+	case IPPROTO_SCTP:
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+		timeouts = nf_sctp_pernet(net)->timeouts;
+#endif
+		break;
+	case 255:
+		timeouts = &nf_generic_pernet(net)->timeout;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
 	}
 
+	if (!timeouts)
+		goto err;
+
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (skb2 == NULL) {
 		err = -ENOMEM;
@@ -458,8 +492,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 					  nlh->nlmsg_seq,
 					  NFNL_MSG_TYPE(nlh->nlmsg_type),
 					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
-					  l3num,
-					  l4proto);
+					  l3num, l4proto, timeouts);
 	if (ret <= 0) {
 		kfree_skb(skb2);
 		err = -ENOMEM;
-- 
cgit 


From e4844c9c62a0fe47980d6c3d4b7a096a5d755925 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 2 Nov 2018 11:33:37 +0100
Subject: netfilter: nft_compat: ebtables 'nat' table is normal chain type

Unlike ip(6)tables, the ebtables nat table has no special properties.
This bug causes 'ebtables -A' to fail when using a target such as
'snat' (ebt_snat target sets ".table = "nat"').  Targets that have
no table restrictions work fine.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 768292eac2a4..9d0ede474224 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -54,9 +54,11 @@ static bool nft_xt_put(struct nft_xt *xt)
 	return false;
 }
 
-static int nft_compat_chain_validate_dependency(const char *tablename,
-						const struct nft_chain *chain)
+static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx,
+						const char *tablename)
 {
+	enum nft_chain_types type = NFT_CHAIN_T_DEFAULT;
+	const struct nft_chain *chain = ctx->chain;
 	const struct nft_base_chain *basechain;
 
 	if (!tablename ||
@@ -64,9 +66,12 @@ static int nft_compat_chain_validate_dependency(const char *tablename,
 		return 0;
 
 	basechain = nft_base_chain(chain);
-	if (strcmp(tablename, "nat") == 0 &&
-	    basechain->type->type != NFT_CHAIN_T_NAT)
-		return -EINVAL;
+	if (strcmp(tablename, "nat") == 0) {
+		if (ctx->family != NFPROTO_BRIDGE)
+			type = NFT_CHAIN_T_NAT;
+		if (basechain->type->type != type)
+			return -EINVAL;
+	}
 
 	return 0;
 }
@@ -342,8 +347,7 @@ static int nft_target_validate(const struct nft_ctx *ctx,
 		if (target->hooks && !(hook_mask & target->hooks))
 			return -EINVAL;
 
-		ret = nft_compat_chain_validate_dependency(target->table,
-							   ctx->chain);
+		ret = nft_compat_chain_validate_dependency(ctx, target->table);
 		if (ret < 0)
 			return ret;
 	}
@@ -590,8 +594,7 @@ static int nft_match_validate(const struct nft_ctx *ctx,
 		if (match->hooks && !(hook_mask & match->hooks))
 			return -EINVAL;
 
-		ret = nft_compat_chain_validate_dependency(match->table,
-							   ctx->chain);
+		ret = nft_compat_chain_validate_dependency(ctx, match->table);
 		if (ret < 0)
 			return ret;
 	}
-- 
cgit 


From f393808dc64149ccd0e5a8427505ba2974a59854 Mon Sep 17 00:00:00 2001
From: Vasily Khoruzhick <vasilykh@arista.com>
Date: Thu, 25 Oct 2018 12:15:43 -0700
Subject: netfilter: conntrack: fix calculation of next bucket number in
 early_drop

If there's no entry to drop in bucket that corresponds to the hash,
early_drop() should look for it in other buckets. But since it increments
hash instead of bucket number, it actually looks in the same bucket 8
times: hsize is 16k by default (14 bits) and hash is 32-bit value, so
reciprocal_scale(hash, hsize) returns the same value for hash..hash+7 in
most cases.

Fix it by increasing bucket number instead of hash and rename _hash
to bucket to avoid future confusion.

Fixes: 3e86638e9a0b ("netfilter: conntrack: consider ct netns in early_drop logic")
Cc: <stable@vger.kernel.org> # v4.7+
Signed-off-by: Vasily Khoruzhick <vasilykh@arista.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ca1168d67fac..e92e749aff53 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1073,19 +1073,22 @@ static unsigned int early_drop_list(struct net *net,
 	return drops;
 }
 
-static noinline int early_drop(struct net *net, unsigned int _hash)
+static noinline int early_drop(struct net *net, unsigned int hash)
 {
-	unsigned int i;
+	unsigned int i, bucket;
 
 	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
 		struct hlist_nulls_head *ct_hash;
-		unsigned int hash, hsize, drops;
+		unsigned int hsize, drops;
 
 		rcu_read_lock();
 		nf_conntrack_get_ht(&ct_hash, &hsize);
-		hash = reciprocal_scale(_hash++, hsize);
+		if (!i)
+			bucket = reciprocal_scale(hash, hsize);
+		else
+			bucket = (bucket + 1) % hsize;
 
-		drops = early_drop_list(net, &ct_hash[hash]);
+		drops = early_drop_list(net, &ct_hash[bucket]);
 		rcu_read_unlock();
 
 		if (drops) {
-- 
cgit 


From fe60faa5063822f2d555f4f326c7dd72a60929bf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 31 Oct 2018 08:39:13 -0700
Subject: net: do not abort bulk send on BQL status

Before calling dev_hard_start_xmit(), upper layers tried
to cook optimal skb list based on BQL budget.

Problem is that GSO packets can end up comsuming more than
the BQL budget.

Breaking the loop is not useful, since requeued packets
are ahead of any packets still in the qdisc.

It is also more expensive, since next TX completion will
push these packets later, while skbs are not in cpu caches.

It is also a behavior difference with TSO packets, that can
break the BQL limit by a large amount.

Note that drivers should use __netdev_tx_sent_queue()
in order to have optimal xmit_more support, and avoid
useless atomic operations as shown in the following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 77d43ae2a7bb..0ffcbdd55fa9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3272,7 +3272,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 		}
 
 		skb = next;
-		if (netif_xmit_stopped(txq) && skb) {
+		if (netif_tx_queue_stopped(txq) && skb) {
 			rc = NETDEV_TX_BUSY;
 			break;
 		}
-- 
cgit 


From a277d516de5f498c91d91189717ef7e01102ad27 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 2 Nov 2018 16:36:55 +0100
Subject: openvswitch: fix linking without CONFIG_NF_CONNTRACK_LABELS

When CONFIG_CC_OPTIMIZE_FOR_DEBUGGING is enabled, the compiler
fails to optimize out a dead code path, which leads to a link failure:

net/openvswitch/conntrack.o: In function `ovs_ct_set_labels':
conntrack.c:(.text+0x2e60): undefined reference to `nf_connlabels_replace'

In this configuration, we can take a shortcut, and completely
remove the contrack label code. This may also help the regular
optimization.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6bec37ab4472..a4660c48ff01 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1203,7 +1203,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 					 &info->labels.mask);
 		if (err)
 			return err;
-	} else if (labels_nonzero(&info->labels.mask)) {
+	} else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+		   labels_nonzero(&info->labels.mask)) {
 		err = ovs_ct_set_labels(ct, key, &info->labels.value,
 					&info->labels.mask);
 		if (err)
-- 
cgit 


From 12480e3b16982c4026de10dd8155823219cd6391 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 3 Nov 2018 14:01:31 +0800
Subject: sctp: define SCTP_SS_DEFAULT for Stream schedulers

According to rfc8260#section-4.3.2, SCTP_SS_DEFAULT is required to
defined as SCTP_SS_FCFS or SCTP_SS_RR.

SCTP_SS_FCFS is used for SCTP_SS_DEFAULT's value in this patch.

Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: Jianwen Ji <jiji@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 9cb854b05342..c37e1c2dec9d 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -212,7 +212,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
 	INIT_LIST_HEAD(&q->retransmit);
 	INIT_LIST_HEAD(&q->sacked);
 	INIT_LIST_HEAD(&q->abandoned);
-	sctp_sched_set_sched(asoc, SCTP_SS_FCFS);
+	sctp_sched_set_sched(asoc, SCTP_SS_DEFAULT);
 }
 
 /* Free the outqueue structure and any related pending chunks.
-- 
cgit 


From 6915ed86cca6a0b422e8ada05ec7009d2074b88a Mon Sep 17 00:00:00 2001
From: Jeff Barnhill <0xeffeff@gmail.com>
Date: Mon, 5 Nov 2018 20:36:45 +0000
Subject: net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS

Move the anycast.c init and cleanup functions which were inadvertently
added inside the CONFIG_PROC_FS definition.

Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable")
Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/anycast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 7698637cf827..94999058e110 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -590,6 +590,7 @@ void ac6_proc_exit(struct net *net)
 {
 	remove_proc_entry("anycast6", net->proc_net);
 }
+#endif
 
 /*	Init / cleanup code
  */
@@ -611,4 +612,3 @@ void ipv6_anycast_cleanup(void)
 		WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
 	spin_unlock(&acaddr_hash_lock);
 }
-#endif
-- 
cgit 


From 5e1acb4afacc6229946c3d2b7ffc422b53fb2448 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 2 Nov 2018 19:11:04 +0300
Subject: rtnetlink: restore handling of dumpit return value in rtnl_dump_all()

For non-zero return from dumpit() we should break the loop
in rtnl_dump_all() and return the result. Otherwise, e.g.,
we could get the memory leak in inet6_dump_fib() [1]. The
pointer to the allocated struct fib6_walker there (saved
in cb->args) can be lost, reset on the next iteration.

Fix it by partially restoring the previous behavior before
commit c63586dc9b3e ("net: rtnl_dump_all needs to propagate
error from dumpit function"). The returned error from
dumpit() is still passed further.

[1]:
unreferenced object 0xffff88001322a200 (size 96):
  comm "sshd", pid 1484, jiffies 4296032768 (age 1432.542s)
  hex dump (first 32 bytes):
    00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de  ................
    18 09 41 36 00 88 ff ff 18 09 41 36 00 88 ff ff  ..A6......A6....
  backtrace:
    [<0000000095846b39>] kmem_cache_alloc_trace+0x151/0x220
    [<000000007d12709f>] inet6_dump_fib+0x68d/0x940
    [<000000002775a316>] rtnl_dump_all+0x1d9/0x2d0
    [<00000000d7cd302b>] netlink_dump+0x945/0x11a0
    [<000000002f43485f>] __netlink_dump_start+0x55d/0x800
    [<00000000f76bbeec>] rtnetlink_rcv_msg+0x4fa/0xa00
    [<000000009b5761f3>] netlink_rcv_skb+0x29c/0x420
    [<0000000087a1dae1>] rtnetlink_rcv+0x15/0x20
    [<00000000691b703b>] netlink_unicast+0x4e3/0x6c0
    [<00000000b5be0204>] netlink_sendmsg+0x7f2/0xba0
    [<0000000096d2aa60>] sock_sendmsg+0xba/0xf0
    [<000000008c1b786f>] __sys_sendto+0x1e4/0x330
    [<0000000019587b3f>] __x64_sys_sendto+0xe1/0x1a0
    [<00000000071f4d56>] do_syscall_64+0x9f/0x300
    [<000000002737577f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [<0000000057587684>] 0xffffffffffffffff

Fixes: c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit function")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e01274bd5e3e..33d9227a8b80 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3367,7 +3367,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 			cb->seq = 0;
 		}
 		ret = dumpit(skb, cb);
-		if (ret < 0)
+		if (ret)
 			break;
 	}
 	cb->family = idx;
-- 
cgit 


From e22d0bfa09a56e427a1a950ccb85621c86b343a4 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 2 Nov 2018 19:11:05 +0300
Subject: ipv6: properly check return value in inet6_dump_all()

Make sure we call fib6_dump_end() if it happens that skb->len
is zero. rtnl_dump_all() can reset cb->args on the next loop
iteration there.

Fixes: 08e814c9e8eb ("net/ipv6: Bail early if user only wants cloned entries")
Fixes: ae677bbb4441 ("net: Don't return invalid table id error when dumping all families")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1b8bc008b53b..ae3786132c23 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -591,7 +591,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 
 	/* fib entries are never clones */
 	if (arg.filter.flags & RTM_F_CLONED)
-		return skb->len;
+		goto out;
 
 	w = (void *)cb->args[2];
 	if (!w) {
@@ -621,7 +621,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		tb = fib6_get_table(net, arg.filter.table_id);
 		if (!tb) {
 			if (arg.filter.dump_all_families)
-				return skb->len;
+				goto out;
 
 			NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
 			return -ENOENT;
-- 
cgit 


From d016b4a3562b745fd9fa387a47d8de62ccd7e241 Mon Sep 17 00:00:00 2001
From: "Matwey V. Kornilov" <matwey@sai.msu.ru>
Date: Fri, 2 Nov 2018 21:19:36 +0300
Subject: net: core: netpoll: Enable netconsole IPv6 link local address

There is no reason to discard using source link local address when
remote netconsole IPv6 address is set to be link local one.

The patch allows administrators to use IPv6 netconsole without
explicitly configuring source address:

    netconsole=@/,@fe80::5054:ff:fe2f:6012/

Signed-off-by: Matwey V. Kornilov <matwey@sai.msu.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 5da9552b186b..2b9fdbc43205 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -717,7 +717,8 @@ int netpoll_setup(struct netpoll *np)
 
 				read_lock_bh(&idev->lock);
 				list_for_each_entry(ifp, &idev->addr_list, if_list) {
-					if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
+					if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) !=
+					    !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
 						continue;
 					np->local_ip.in6 = ifp->addr;
 					err = 0;
-- 
cgit 


From c34c1287778b080ed692c0a46a8e345206cc29e6 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Sun, 4 Nov 2018 22:37:15 -0800
Subject: sock_diag: fix autoloading of the raw_diag module

IPPROTO_RAW isn't registred as an inet protocol, so
inet_protos[protocol] is always NULL for it.

Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Xin Long <lucien.xin@gmail.com>
Fixes: bf2ae2e4bf93 ("sock_diag: request _diag module only when the family or proto has been registered")
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6fcc4bc07d19..080a880a1761 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3279,6 +3279,7 @@ int sock_load_diag_module(int family, int protocol)
 
 #ifdef CONFIG_INET
 	if (family == AF_INET &&
+	    protocol != IPPROTO_RAW &&
 	    !rcu_access_pointer(inet_protos[protocol]))
 		return -ENOENT;
 #endif
-- 
cgit 


From 97adaddaa6db7a8af81b9b11e30cbe3628cd6700 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Nov 2018 22:31:41 +0900
Subject: net: bpfilter: fix iptables failure if bpfilter_umh is disabled

When iptables command is executed, ip_{set/get}sockopt() try to upload
bpfilter.ko if bpfilter is enabled. if it couldn't find bpfilter.ko,
command is failed.
bpfilter.ko is generated if CONFIG_BPFILTER_UMH is enabled.
ip_{set/get}sockopt() only checks CONFIG_BPFILTER.
So that if CONFIG_BPFILTER is enabled and CONFIG_BPFILTER_UMH is disabled,
iptables command is always failed.

test config:
   CONFIG_BPFILTER=y
   # CONFIG_BPFILTER_UMH is not set

test command:
   %iptables -L
   iptables: No chain/target/match by that name.

Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 26c36cccabdc..fffcc130900e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1246,7 +1246,7 @@ int ip_setsockopt(struct sock *sk, int level,
 		return -ENOPROTOOPT;
 
 	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_BPFILTER
+#if IS_ENABLED(CONFIG_BPFILTER_UMH)
 	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
 	    optname < BPFILTER_IPT_SET_MAX)
 		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
@@ -1559,7 +1559,7 @@ int ip_getsockopt(struct sock *sk, int level,
 	int err;
 
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
-#ifdef CONFIG_BPFILTER
+#if IS_ENABLED(CONFIG_BPFILTER_UMH)
 	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
 	    optname < BPFILTER_IPT_GET_MAX)
 		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
@@ -1596,7 +1596,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
 		MSG_CMSG_COMPAT);
 
-#ifdef CONFIG_BPFILTER
+#if IS_ENABLED(CONFIG_BPFILTER_UMH)
 	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
 	    optname < BPFILTER_IPT_GET_MAX)
 		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
-- 
cgit 


From 0d5b9311baf27bb545f187f12ecfd558220c607d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Nov 2018 17:34:27 -0800
Subject: inet: frags: better deal with smp races
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multiple cpus might attempt to insert a new fragment in rhashtable,
if for example RPS is buggy, as reported by 배석진 in
https://patchwork.ozlabs.org/patch/994601/

We use rhashtable_lookup_get_insert_key() instead of
rhashtable_insert_fast() to let cpus losing the race
free their own inet_frag_queue and use the one that
was inserted by another cpu.

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: 배석진 <soukjin.bae@samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index bcb11f3a27c0..760a9e52e02b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -178,21 +178,22 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 }
 
 static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
-						void *arg)
+						void *arg,
+						struct inet_frag_queue **prev)
 {
 	struct inet_frags *f = nf->f;
 	struct inet_frag_queue *q;
-	int err;
 
 	q = inet_frag_alloc(nf, f, arg);
-	if (!q)
+	if (!q) {
+		*prev = ERR_PTR(-ENOMEM);
 		return NULL;
-
+	}
 	mod_timer(&q->timer, jiffies + nf->timeout);
 
-	err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
-				     f->rhash_params);
-	if (err < 0) {
+	*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
+						 &q->node, f->rhash_params);
+	if (*prev) {
 		q->flags |= INET_FRAG_COMPLETE;
 		inet_frag_kill(q);
 		inet_frag_destroy(q);
@@ -204,22 +205,22 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
 /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 {
-	struct inet_frag_queue *fq;
+	struct inet_frag_queue *fq = NULL, *prev;
 
 	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
 		return NULL;
 
 	rcu_read_lock();
 
-	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
-	if (fq) {
+	prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+	if (!prev)
+		fq = inet_frag_create(nf, key, &prev);
+	if (prev && !IS_ERR(prev)) {
+		fq = prev;
 		if (!refcount_inc_not_zero(&fq->refcnt))
 			fq = NULL;
-		rcu_read_unlock();
-		return fq;
 	}
 	rcu_read_unlock();
-
-	return inet_frag_create(nf, key);
+	return fq;
 }
 EXPORT_SYMBOL(inet_frag_find);
-- 
cgit 


From 62230715fd2453b3ba948c9d83cfb3ada9169169 Mon Sep 17 00:00:00 2001
From: 배석진 <soukjin.bae@samsung.com>
Date: Fri, 9 Nov 2018 16:53:06 -0800
Subject: flow_dissector: do not dissect l4 ports for fragments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only first fragment has the sport/dport information,
not the following ones.

If we want consistent hash for all fragments, we need to
ignore ports even for first fragment.

This bug is visible for IPv6 traffic, if incoming fragments
do not have a flow label, since skb_get_hash() will give
different results for first fragment and following ones.

It is also visible if any routing rule wants dissection
and sport or dport.

See commit 5e5d6fed3741 ("ipv6: route: dissect flow
in input path if fib rules need it") for details.

[edumazet] rewrote the changelog completely.

Fixes: 06635a35d13d ("flow_dissect: use programable dissector in skb_flow_dissect and friends")
Signed-off-by: 배석진 <soukjin.bae@samsung.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 676f3ad629f9..588f475019d4 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1166,8 +1166,8 @@ ip_proto_again:
 		break;
 	}
 
-	if (dissector_uses_key(flow_dissector,
-			       FLOW_DISSECTOR_KEY_PORTS)) {
+	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) &&
+	    !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) {
 		key_ports = skb_flow_dissector_target(flow_dissector,
 						      FLOW_DISSECTOR_KEY_PORTS,
 						      target_container);
-- 
cgit 


From 63c82997f5c0f3e1b914af43d82f712a86bc5f3a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 9 Nov 2018 21:06:26 -0800
Subject: net: sched: cls_flower: validate nested enc_opts_policy to avoid
 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TCA_FLOWER_KEY_ENC_OPTS and TCA_FLOWER_KEY_ENC_OPTS_MASK can only
currently contain further nested attributes, which are parsed by
hand, so the policy is never actually used resulting in a W=1
build warning:

net/sched/cls_flower.c:492:1: warning: ‘enc_opts_policy’ defined but not used [-Wunused-const-variable=]
 enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {

Add the validation anyway to avoid potential bugs when other
attributes are added and to make the attribute structure slightly
more clear.  Validation will also set extact to point to bad
attribute on error.

Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9aada2d0ef06..c6c327874abc 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -709,11 +709,23 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			  struct netlink_ext_ack *extack)
 {
 	const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
-	int option_len, key_depth, msk_depth = 0;
+	int err, option_len, key_depth, msk_depth = 0;
+
+	err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS],
+				  TCA_FLOWER_KEY_ENC_OPTS_MAX,
+				  enc_opts_policy, extack);
+	if (err)
+		return err;
 
 	nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
 
 	if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
+		err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK],
+					  TCA_FLOWER_KEY_ENC_OPTS_MAX,
+					  enc_opts_policy, extack);
+		if (err)
+			return err;
+
 		nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
 		msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
 	}
-- 
cgit 


From 7ab412d33b4c7ff3e0148d3db25dd861edd1283d Mon Sep 17 00:00:00 2001
From: Jon Maloy <donmalo99@gmail.com>
Date: Sat, 10 Nov 2018 17:30:24 -0500
Subject: tipc: fix link re-establish failure

When a link failure is detected locally, the link is reset, the flag
link->in_session is set to false, and a RESET_MSG with the 'stopping'
bit set is sent to the peer.

The purpose of this bit is to inform the peer that this endpoint just
is going down, and that the peer should handle the reception of this
particular RESET message as a local failure. This forces the peer to
accept another RESET or ACTIVATE message from this endpoint before it
can re-establish the link. This again is necessary to ensure that
link session numbers are properly exchanged before the link comes up
again.

If a failure is detected locally at the same time at the peer endpoint
this will do the same, which is also a correct behavior.

However, when receiving such messages, the endpoints will not
distinguish between 'stopping' RESETs and ordinary ones when it comes
to updating session numbers. Both endpoints will copy the received
session number and set their 'in_session' flags to true at the
reception, while they are still expecting another RESET from the
peer before they can go ahead and re-establish. This is contradictory,
since, after applying the validation check referred to below, the
'in_session' flag will cause rejection of all such messages, and the
link will never come up again.

We now fix this by not only handling received RESET/STOPPING messages
as a local failure, but also by omitting to set a new session number
and the 'in_session' flag in such cases.

Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 201c3b5bc96b..836727e363c4 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1594,14 +1594,17 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 		if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
 			l->priority = peers_prio;
 
-		/* ACTIVATE_MSG serves as PEER_RESET if link is already down */
-		if (msg_peer_stopping(hdr))
+		/* If peer is going down we want full re-establish cycle */
+		if (msg_peer_stopping(hdr)) {
 			rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
-		else if ((mtyp == RESET_MSG) || !link_is_up(l))
+			break;
+		}
+		/* ACTIVATE_MSG serves as PEER_RESET if link is already down */
+		if (mtyp == RESET_MSG || !link_is_up(l))
 			rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT);
 
 		/* ACTIVATE_MSG takes up link if it was already locally reset */
-		if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING))
+		if (mtyp == ACTIVATE_MSG && l->state == LINK_ESTABLISHING)
 			rc = TIPC_LINK_UP_EVT;
 
 		l->peer_session = msg_session(hdr);
-- 
cgit 


From 7236ead1b14923f3ba35cd29cce13246be83f451 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 10 Nov 2018 16:22:29 -0800
Subject: act_mirred: clear skb->tstamp on redirect

If sch_fq is used at ingress, skbs that might have been
timestamped by net_timestamp_set() if a packet capture
is requesting timestamps could be delayed by arbitrary
amount of time, since sch_fq time base is MONOTONIC.

Fix this problem by moving code from sch_netem.c to act_mirred.c.

Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_mirred.c | 3 ++-
 net/sched/sch_netem.c  | 9 ---------
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1dae5f2b358f..c8cf4d10c435 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -258,7 +258,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 	if (is_redirect) {
 		skb2->tc_redirected = 1;
 		skb2->tc_from_ingress = skb2->tc_at_ingress;
-
+		if (skb2->tc_from_ingress)
+			skb2->tstamp = 0;
 		/* let's the caller reinsert the packet, if possible */
 		if (use_reinsert) {
 			res->ingress = want_ingress;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 57b3ad9394ad..2c38e3d07924 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -648,15 +648,6 @@ deliver:
 			 */
 			skb->dev = qdisc_dev(sch);
 
-#ifdef CONFIG_NET_CLS_ACT
-			/*
-			 * If it's at ingress let's pretend the delay is
-			 * from the network (tstamp will be updated).
-			 */
-			if (skb->tc_redirected && skb->tc_from_ingress)
-				skb->tstamp = 0;
-#endif
-
 			if (q->slot.slot_next) {
 				q->slot.packets_left--;
 				q->slot.bytes_left -= qdisc_pkt_len(skb);
-- 
cgit