From 61792b677415b77c8db04991c22966bb8de7603e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Oct 2018 16:47:16 +0200 Subject: netfilter: ipv6: fix oops when defragmenting locally generated fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike ipv4 and normal ipv6 defrag, netfilter ipv6 defragmentation did not save/restore skb->dst. This causes oops when handling locally generated ipv6 fragments, as output path needs a valid dst. Reported-by: Maciej Żenczykowski Fixes: 84379c9afe01 ("netfilter: ipv6: nf_defrag: drop skb dst before queueing") Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b8ac369f98ad..d219979c3e52 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -587,11 +587,16 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) */ ret = -EINPROGRESS; if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len && - nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; - else + fq->q.meat == fq->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + if (nf_ct_frag6_reasm(fq, skb, dev)) + ret = 0; + skb->_skb_refdst = orefdst; + } else { skb_dst_drop(skb); + } out_unlock: spin_unlock_bh(&fq->q.lock); -- cgit From 5e91c9d9cd3fd557226ca75fed58816b9eee7e07 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Oct 2018 21:49:45 +0200 Subject: netfilter: nft_osf: check if attribute is present If the attribute is not sent, eg. old libnftnl binary, then tb[NFTA_OSF_TTL] is NULL and kernel crashes from the _init path. Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index ca5e5d8c5ef8..b13618c764ec 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -50,7 +50,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; - if (nla_get_u8(tb[NFTA_OSF_TTL])) { + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) return -EINVAL; -- cgit From 4269fea768a11a447d8de620ce420f2214d4685c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Oct 2018 11:14:28 +0200 Subject: Revert "netfilter: nft_numgen: add map lookups for numgen random operations" Laura found a better way to do this from userspace without requiring kernel infrastructure, revert this. Fixes: 978d8f9055c3 ("netfilter: nft_numgen: add map lookups for numgen random operations") Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 127 --------------------------------------------- 1 file changed, 127 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 649d1700ec5b..3cc1b3dc3c3c 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -24,7 +24,6 @@ struct nft_ng_inc { u32 modulus; atomic_t counter; u32 offset; - struct nft_set *map; }; static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) @@ -48,34 +47,11 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_inc_gen(priv); } -static void nft_ng_inc_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_inc_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, - [NFTA_NG_SET_NAME] = { .type = NLA_STRING, - .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_NG_SET_ID] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -101,22 +77,6 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_inc_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_inc_init(ctx, expr, tb); - - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -143,27 +103,10 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_inc_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_inc *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_INCREMENTAL, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; u32 offset; - struct nft_set *map; }; static u32 nft_ng_random_gen(struct nft_ng_random *priv) @@ -183,25 +126,6 @@ static void nft_ng_random_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_random_gen(priv); } -static void nft_ng_random_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_random_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static int nft_ng_random_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -226,21 +150,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_random_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); @@ -249,22 +158,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_random_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_random *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_RANDOM, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, @@ -274,14 +167,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .dump = nft_ng_inc_dump, }; -static const struct nft_expr_ops nft_ng_inc_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), - .eval = nft_ng_inc_map_eval, - .init = nft_ng_inc_map_init, - .dump = nft_ng_inc_map_dump, -}; - static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), @@ -290,14 +175,6 @@ static const struct nft_expr_ops nft_ng_random_ops = { .dump = nft_ng_random_dump, }; -static const struct nft_expr_ops nft_ng_random_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), - .eval = nft_ng_random_map_eval, - .init = nft_ng_random_map_init, - .dump = nft_ng_random_map_dump, -}; - static const struct nft_expr_ops * nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { @@ -312,12 +189,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) switch (type) { case NFT_NG_INCREMENTAL: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_random_map_ops; return &nft_ng_random_ops; } -- cgit From 439cd39ea136d2c026805264d58a91f36b6b64ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sat, 14 Jul 2018 21:59:43 +0200 Subject: netfilter: ipset: list:set: Decrease refcount synchronously on deletion and replace Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") postponed decreasing set reference counters to the RCU callback. An 'ipset del' command can terminate before the RCU grace period is elapsed, and if sets are listed before then, the reference counter shown in userspace will be wrong: # ipset create h hash:ip; ipset create l list:set; ipset add l # ipset del l h; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 1 Number of entries: 0 Members: # sleep 1; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 0 Number of entries: 0 Members: Fix this by making the reference count update synchronous again. As a result, when sets are listed, ip_set_name_byindex() might now fetch a set whose reference count is already zero. Instead of relying on the reference count to protect against concurrent set renaming, grab ip_set_ref_lock as reader and copy the name, while holding the same lock in ip_set_rename() as writer instead. Reported-by: Li Shuang Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") Signed-off-by: Stefano Brivio Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 23 +++++++++++------------ net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------ 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bc4bd247bb7d..fa15a831aeee 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. - * We assume the set is referenced, so it does exist and - * can't be destroyed. The set cannot be renamed due to - * the referencing either. - * + * Set itself is protected by RCU, but its name isn't: to protect against + * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the + * name. */ -const char * -ip_set_name_byindex(struct net *net, ip_set_id_t index) +void +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { - const struct ip_set *set = ip_set_rcu_get(net, index); + struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); - BUG_ON(set->ref == 0); - /* Referenced, so it's safe */ - return set->name; + read_lock_bh(&ip_set_ref_lock); + strncpy(name, set->name, IPSET_MAXNAMELEN); + read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, if (!set) return -ENOENT; - read_lock_bh(&ip_set_ref_lock); + write_lock_bh(&ip_set_ref_lock); if (set->ref != 0) { ret = -IPSET_ERR_REFERENCED; goto out; @@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, strncpy(set->name, name2, IPSET_MAXNAMELEN); out: - read_unlock_bh(&ip_set_ref_lock); + write_unlock_bh(&ip_set_ref_lock); return ret; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 072a658fde04..4eef55da0878 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; - struct list_set *map = set->data; - ip_set_put_byindex(map->net, e->id); ip_set_ext_destroy(set, e); kfree(e); } @@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu) static inline void list_set_del(struct ip_set *set, struct set_elem *e) { + struct list_set *map = set->data; + set->elements--; list_del_rcu(&e->list); + ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct set_elem *e, struct set_elem *old) +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { + struct list_set *map = set->data; + list_replace_rcu(&old->list, &e->list); + ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(e, n); + list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set, const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set, nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; - if (nla_put_string(skb, IPSET_ATTR_NAME, - ip_set_name_byindex(map->net, e->id))) + ip_set_name_byindex(map->net, e->id, name); + if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; -- cgit From 886503f34d63e681662057448819edb5b1057a97 Mon Sep 17 00:00:00 2001 From: Eric Westbrook Date: Tue, 28 Aug 2018 15:14:42 -0600 Subject: netfilter: ipset: actually allow allowable CIDR 0 in hash:net,port,net Allow /0 as advertised for hash:net,port,net sets. For "hash:net,port,net", ipset(8) says that "either subnet is permitted to be a /0 should you wish to match port between all destinations." Make that statement true. Before: # ipset create cidrzero hash:net,port,net # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0 ipset v6.34: The value of the CIDR parameter of the IP address is invalid # ipset create cidrzero6 hash:net,port,net family inet6 # ipset add cidrzero6 ::/0,12345,::/0 ipset v6.34: The value of the CIDR parameter of the IP address is invalid After: # ipset create cidrzero hash:net,port,net # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0 # ipset test cidrzero 192.168.205.129,12345,172.16.205.129 192.168.205.129,tcp:12345,172.16.205.129 is in set cidrzero. # ipset create cidrzero6 hash:net,port,net family inet6 # ipset add cidrzero6 ::/0,12345,::/0 # ipset test cidrzero6 fe80::1,12345,ff00::1 fe80::1,tcp:12345,ff00::1 is in set cidrzero6. See also: https://bugzilla.kernel.org/show_bug.cgi?id=200897 https://github.com/ewestbrook/linux/commit/df7ff6efb0934ab6acc11f003ff1a7580d6c1d9c Signed-off-by: Eric Westbrook Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index d391485a6acd..613e18e720a4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -213,13 +213,13 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } @@ -493,13 +493,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } -- cgit From ed956f3947a01ff9875cd908d7c1ef1fe7f47bf0 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 22 Oct 2018 23:30:40 +0200 Subject: netfilter: ipset: fix ip_set_list allocation failure ip_set_create() and ip_set_net_init() attempt to allocate physically contiguous memory for ip_set_list. If memory is fragmented, the allocations could easily fail: vzctl: page allocation failure: order:7, mode:0xc0d0 Call Trace: dump_stack+0x19/0x1b warn_alloc_failed+0x110/0x180 __alloc_pages_nodemask+0x7bf/0xc60 alloc_pages_current+0x98/0x110 kmalloc_order+0x18/0x40 kmalloc_order_trace+0x26/0xa0 __kmalloc+0x279/0x290 ip_set_net_init+0x4b/0x90 [ip_set] ops_init+0x3b/0xb0 setup_net+0xbb/0x170 copy_net_ns+0xf1/0x1c0 create_new_namespaces+0xf9/0x180 copy_namespaces+0x8e/0xd0 copy_process+0xb61/0x1a00 do_fork+0x91/0x320 Use kvcalloc() to fallback to 0-order allocations if high order page isn't available. Signed-off-by: Andrey Ryabinin Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index fa15a831aeee..68db946df151 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -960,7 +960,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Wraparound */ goto cleanup; - list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -972,7 +972,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; - kfree(tmp); + kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; @@ -2058,7 +2058,7 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; @@ -2086,7 +2086,7 @@ ip_set_net_exit(struct net *net) } } nfnl_unlock(NFNL_SUBSYS_IPSET); - kfree(rcu_dereference_protected(inst->ip_set_list, 1)); + kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { -- cgit From 7de414a9dd91426318df7b63da024b2b07e53df5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 1 Nov 2018 12:02:37 -0700 Subject: net: drop skb on failure in ip_check_defrag() Most callers of pskb_trim_rcsum() simply drop the skb when it fails, however, ip_check_defrag() still continues to pass the skb up to stack. This is suspicious. In ip_check_defrag(), after we learn the skb is an IP fragment, passing the skb to callers makes no sense, because callers expect fragments are defrag'ed on success. So, dropping the skb when we can't defrag it is reasonable. Note, prior to commit 88078d98d1bb, this is not a big problem as checksum will be fixed up anyway. After it, the checksum is not correct on failure. Found this during code review. Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends") Cc: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9b0158fa431f..d6ee343fdb86 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -722,10 +722,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) - return skb; - if (pskb_trim_rcsum(skb, netoff + len)) - return skb; + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { + kfree_skb(skb); + return NULL; + } + if (pskb_trim_rcsum(skb, netoff + len)) { + kfree_skb(skb); + return NULL; + } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; -- cgit From 49682bfa1e0e448a711471a5db83be0df1fb39a2 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 31 Oct 2018 13:16:58 +0100 Subject: net: document skb parameter in function 'skb_gso_size_check' Remove kernel-doc warning: net/core/skbuff.c:4953: warning: Function parameter or member 'skb' not described in 'skb_gso_size_check' Signed-off-by: Mathieu Malaterre Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 946de0e24c87..b4ee5c8b928f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4944,6 +4944,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * + * @skb: GSO skb + * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * -- cgit From 2384d02520ff2a916169b2fd85ea50e923ed56c2 Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Fri, 2 Nov 2018 20:23:57 +0000 Subject: net/ipv6: Add anycast addresses to a global hashtable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit icmp6_send() function is expensive on systems with a large number of interfaces. Every time it’s called, it has to verify that the source address does not correspond to an existing anycast address by looping through every device and every anycast address on the device. This can result in significant delays for a CPU when there are a large number of neighbors and ND timers are frequently timing out and calling neigh_invalidate(). Add anycast addresses to a global hashtable to allow quick searching for matching anycast addresses. This is based on inet6_addr_lst in addrconf.c. Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 5 ++++ net/ipv6/anycast.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3f4d61017a69..f0cd291034f0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1001,6 +1001,9 @@ static int __init inet6_init(void) err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; + err = ipv6_anycast_init(); + if (err) + goto ipv6_anycast_fail; err = addrconf_init(); if (err) goto addrconf_fail; @@ -1091,6 +1094,8 @@ ipv6_frag_fail: ipv6_exthdrs_fail: addrconf_cleanup(); addrconf_fail: + ipv6_anycast_cleanup(); +ipv6_anycast_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: ndisc_late_cleanup(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 4e0ff7031edd..7698637cf827 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -44,8 +44,22 @@ #include +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) +/* anycast address hash table + */ +static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(acaddr_hash_lock); + static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); +static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +{ + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); +} + /* * socket join an anycast group */ @@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk) rtnl_unlock(); } +static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) +{ + unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); + + spin_lock(&acaddr_hash_lock); + hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); + spin_unlock(&acaddr_hash_lock); +} + +static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) +{ + spin_lock(&acaddr_hash_lock); + hlist_del_init_rcu(&aca->aca_addr_lst); + spin_unlock(&acaddr_hash_lock); +} + static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } +static void aca_free_rcu(struct rcu_head *h) +{ + struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); + + fib6_info_release(aca->aca_rt); + kfree(aca); +} + static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - fib6_info_release(ac->aca_rt); - kfree(ac); + call_rcu(&ac->rcu, aca_free_rcu); } } @@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; + INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; @@ -285,6 +323,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); + ipv6_add_acaddr_hash(net, aca); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -325,6 +365,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) else idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -352,6 +393,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); + addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -390,17 +433,25 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { + unsigned int hash = inet6_acaddr_hash(net, addr); + struct net_device *nh_dev; + struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else - for_each_netdev_rcu(net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { + hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], + aca_addr_lst) { + nh_dev = fib6_info_nh_dev(aca->aca_rt); + if (!nh_dev || !net_eq(dev_net(nh_dev), net)) + continue; + if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } + } rcu_read_unlock(); return found; } @@ -539,4 +590,25 @@ void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } + +/* Init / cleanup code + */ +int __init ipv6_anycast_init(void) +{ + int i; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); + return 0; +} + +void ipv6_anycast_cleanup(void) +{ + int i; + + spin_lock(&acaddr_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); + spin_unlock(&acaddr_hash_lock); +} #endif -- cgit From c7e86acfcee30794dc99a0759924bf7b9d43f1ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 13:39:53 +0000 Subject: rxrpc: Fix lockup due to no error backoff after ack transmit error If the network becomes (partially) unavailable, say by disabling IPv6, the background ACK transmission routine can get itself into a tizzy by proposing immediate ACK retransmission. Since we're in the call event processor, that happens immediately without returning to the workqueue manager. The condition should clear after a while when either the network comes back or the call times out. Fix this by: (1) When re-proposing an ACK on failed Tx, don't schedule it immediately. This will allow a certain amount of time to elapse before we try again. (2) Enforce a return to the workqueue manager after a certain number of iterations of the call processing loop. (3) Add a backoff delay that increases the delay on deferred ACKs by a jiffy per failed transmission to a limit of HZ. The backoff delay is cleared on a successful return from kernel_sendmsg(). (4) Cancel calls immediately if the opening sendmsg fails. The layer above can arrange retransmission or rotate to another server. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 18 ++++++++++++++---- net/rxrpc/output.c | 35 +++++++++++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 382196e57a26..bc628acf4f4f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -611,6 +611,7 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + u16 tx_backoff; /* Delay to insert due to Tx failure */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8e7434e92097..468efc3660c0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -123,6 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, else ack_at = expiry; + ack_at += READ_ONCE(call->tx_backoff); ack_at += now; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); @@ -311,6 +312,7 @@ void rxrpc_process_call(struct work_struct *work) container_of(work, struct rxrpc_call, processor); rxrpc_serial_t *send_ack; unsigned long now, next, t; + unsigned int iterations = 0; rxrpc_see_call(call); @@ -319,6 +321,11 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events); recheck_state: + /* Limit the number of times we do this before returning to the manager */ + iterations++; + if (iterations > 5) + goto requeue; + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_send_abort_packet(call); goto recheck_state; @@ -447,13 +454,16 @@ recheck_state: rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ - if (call->events && call->state < RXRPC_CALL_COMPLETE) { - __rxrpc_queue_call(call); - goto out; - } + if (call->events && call->state < RXRPC_CALL_COMPLETE) + goto requeue; out_put: rxrpc_put_call(call, rxrpc_call_put); out: _leave(""); + return; + +requeue: + __rxrpc_queue_call(call); + goto out; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 189418888839..736aa9281100 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -34,6 +34,21 @@ struct rxrpc_abort_buffer { static const char rxrpc_keepalive_string[] = ""; +/* + * Increase Tx backoff on transmission failure and clear it on success. + */ +static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) +{ + if (ret < 0) { + u16 tx_backoff = READ_ONCE(call->tx_backoff); + + if (tx_backoff < HZ) + WRITE_ONCE(call->tx_backoff, tx_backoff + 1); + } else { + WRITE_ONCE(call->tx_backoff, 0); + } +} + /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep @@ -210,6 +225,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, else trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, rxrpc_tx_point_call_ack); + rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -218,7 +234,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true, + false, true, rxrpc_propose_ack_retry_tx); } else { spin_lock_bh(&call->lock); @@ -300,7 +316,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) else trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); - + rxrpc_tx_backoff(call, ret); rxrpc_put_connection(conn); return ret; @@ -413,6 +429,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); + rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; @@ -445,9 +462,18 @@ done: rxrpc_reduce_call_timer(call, expect_rx_by, nowj, rxrpc_timer_set_for_normal); } - } - rxrpc_set_keepalive(call); + rxrpc_set_keepalive(call); + } else { + /* Cancel the call if the initial transmission fails, + * particularly if that's due to network routing issues that + * aren't going away anytime soon. The layer above can arrange + * the retransmission. + */ + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_USER_ABORT, ret); + } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; @@ -506,6 +532,7 @@ send_fragmentable: else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); + rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); goto done; -- cgit From 54451f60c8fa061af9051a53be9786393947367c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 21 Oct 2018 00:00:08 +0900 Subject: netfilter: xt_IDLETIMER: add sysfs filename checking routine When IDLETIMER rule is added, sysfs file is created under /sys/class/xt_idletimer/timers/ But some label name shouldn't be used. ".", "..", "power", "uevent", "subsystem", etc... So that sysfs filename checking routine is needed. test commands: %iptables -I INPUT -j IDLETIMER --timeout 1 --label "power" splat looks like: [95765.423132] sysfs: cannot create duplicate filename '/devices/virtual/xt_idletimer/timers/power' [95765.433418] CPU: 0 PID: 8446 Comm: iptables Not tainted 4.19.0-rc6+ #20 [95765.449755] Call Trace: [95765.449755] dump_stack+0xc9/0x16b [95765.449755] ? show_regs_print_info+0x5/0x5 [95765.449755] sysfs_warn_dup+0x74/0x90 [95765.449755] sysfs_add_file_mode_ns+0x352/0x500 [95765.449755] sysfs_create_file_ns+0x179/0x270 [95765.449755] ? sysfs_add_file_mode_ns+0x500/0x500 [95765.449755] ? idletimer_tg_checkentry+0x3e5/0xb1b [xt_IDLETIMER] [95765.449755] ? rcu_read_lock_sched_held+0x114/0x130 [95765.449755] ? __kmalloc_track_caller+0x211/0x2b0 [95765.449755] ? memcpy+0x34/0x50 [95765.449755] idletimer_tg_checkentry+0x4e2/0xb1b [xt_IDLETIMER] [ ... ] Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_IDLETIMER.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index c6acfc2d9c84..eb4cbd244c3d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -114,6 +114,22 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } +static int idletimer_check_sysfs_name(const char *name, unsigned int size) +{ + int ret; + + ret = xt_check_proc_name(name, size); + if (ret < 0) + return ret; + + if (!strcmp(name, "power") || + !strcmp(name, "subsystem") || + !strcmp(name, "uevent")) + return -EINVAL; + + return 0; +} + static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; @@ -124,6 +140,10 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); + if (ret < 0) + goto out_free_timer; + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { -- cgit From 8a02bdd50b2ecb6d62121d2958d3ea186cc88ce7 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 30 Oct 2018 22:43:42 +0100 Subject: netfilter: ipset: Fix calling ip_set() macro at dumping The ip_set() macro is called when either ip_set_ref_lock held only or no lock/nfnl mutex is held at dumping. Take this into account properly. Also, use Pablo's suggestion to use rcu_dereference_raw(), the ref_netlink protects the set. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 68db946df151..1577f2f76060 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -55,11 +55,15 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); -/* When the nfnl mutex is held: */ +/* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(p) \ - rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&ip_set_ref_lock)) #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] +#define ip_set_ref_netlink(inst,id) \ + rcu_dereference_raw((inst)->ip_set_list)[id] /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -1251,7 +1255,7 @@ ip_set_dump_done(struct netlink_callback *cb) struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; - struct ip_set *set = ip_set(inst, index); + struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); @@ -1440,7 +1444,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - set = ip_set(inst, index); + set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); -- cgit From a95a7774d51e13f9cf4b7285666829b68852f07a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Nov 2018 00:11:34 +0100 Subject: netfilter: conntrack: add nf_{tcp,udp,sctp,icmp,dccp,icmpv6,generic}_pernet() Expose these functions to access conntrack protocol tracker netns area, nfnetlink_cttimeout needs this. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_dccp.c | 13 ++++--------- net/netfilter/nf_conntrack_proto_generic.c | 11 +++-------- net/netfilter/nf_conntrack_proto_icmp.c | 11 +++-------- net/netfilter/nf_conntrack_proto_icmpv6.c | 11 +++-------- net/netfilter/nf_conntrack_proto_sctp.c | 11 +++-------- net/netfilter/nf_conntrack_proto_tcp.c | 15 +++++---------- net/netfilter/nf_conntrack_proto_udp.c | 11 +++-------- 7 files changed, 24 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 171e9e122e5f..023c1445bc39 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -384,11 +384,6 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }, }; -static inline struct nf_dccp_net *dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} - static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct dccp_hdr *dh) @@ -401,7 +396,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb, state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: - dn = dccp_pernet(net); + dn = nf_dccp_pernet(net); if (dn->dccp_loose == 0) { msg = "not picking up existing connection "; goto out_invalid; @@ -568,7 +563,7 @@ static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; + timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; @@ -681,7 +676,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); unsigned int *timeouts = data; int i; @@ -814,7 +809,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, static int dccp_init_net(struct net *net) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); struct nf_proto_net *pn = &dn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e10e867e0b55..5da19d5fbc76 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -27,11 +27,6 @@ static bool nf_generic_should_process(u8 proto) } } -static inline struct nf_generic_net *generic_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.generic; -} - static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -58,7 +53,7 @@ static int generic_packet(struct nf_conn *ct, } if (!timeout) - timeout = &generic_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -72,7 +67,7 @@ static int generic_packet(struct nf_conn *ct, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); unsigned int *timeout = data; if (!timeout) @@ -138,7 +133,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, static int generic_init_net(struct net *net) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 3598520bd19b..de64d8a5fdfd 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -25,11 +25,6 @@ static const unsigned int nf_ct_icmp_timeout = 30*HZ; -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { @@ -103,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct, } if (!timeout) - timeout = &icmp_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -275,7 +270,7 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) @@ -337,7 +332,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int icmp_init_net(struct net *net) { - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 378618feed5d..a15eefb8e317 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -30,11 +30,6 @@ static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; -static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6; -} - static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, @@ -87,7 +82,7 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, static unsigned int *icmpv6_get_timeouts(struct net *net) { - return &icmpv6_pernet(net)->timeout; + return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ @@ -286,7 +281,7 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); @@ -348,7 +343,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, static int icmpv6_init_net(struct net *net) { - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmpv6_timeout; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 3d719d3eb9a3..d53e3e78f605 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -146,11 +146,6 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static inline struct nf_sctp_net *sctp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.sctp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -480,7 +475,7 @@ static int sctp_packet(struct nf_conn *ct, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); @@ -599,7 +594,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; /* set default SCTP timeouts. */ @@ -736,7 +731,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int sctp_init_net(struct net *net) { - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1bcf9984d45e..4dcbd51a8e97 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -272,11 +272,6 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { } }; -static inline struct nf_tcp_net *tcp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.tcp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -475,7 +470,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; @@ -767,7 +762,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); - const struct nf_tcp_net *tn = tcp_pernet(net); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; @@ -841,7 +836,7 @@ static int tcp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; @@ -1283,7 +1278,7 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; @@ -1508,7 +1503,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int tcp_init_net(struct net *net) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index a7aa70370913..c879d8d78cfd 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -32,14 +32,9 @@ static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_REPLIED] = 180*HZ, }; -static inline struct nf_udp_net *udp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.udp; -} - static unsigned int *udp_get_timeouts(struct net *net) { - return udp_pernet(net)->timeouts; + return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, @@ -212,7 +207,7 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; @@ -292,7 +287,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int udp_init_net(struct net *net) { - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); struct nf_proto_net *pn = &un->pn; if (!pn->users) { -- cgit From 8866df9264a34e675b4ee8a151db819b87cce2d3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Nov 2018 00:14:00 +0100 Subject: netfilter: nfnetlink_cttimeout: pass default timeout policy to obj_to_nlattr Otherwise, we hit a NULL pointer deference since handlers always assume default timeout policy is passed. netlink: 24 bytes leftover after parsing attributes in process `syz-executor2'. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 9575 Comm: syz-executor1 Not tainted 4.19.0+ #312 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:icmp_timeout_obj_to_nlattr+0x77/0x170 net/netfilter/nf_conntrack_proto_icmp.c:297 Fixes: c779e849608a ("netfilter: conntrack: remove get_timeout() indirection") Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 47 +++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index e7a50af1b3d6..a518eb162344 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -382,7 +382,8 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, - const struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto, + const unsigned int *timeouts) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -408,7 +409,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; @@ -430,6 +431,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts = NULL; struct sk_buff *skb2; int ret, err; __u16 l3num; @@ -442,12 +444,44 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find_get(l4num); - /* This protocol is not supported, skip. */ - if (l4proto->l4proto != l4num) { - err = -EOPNOTSUPP; + err = -EOPNOTSUPP; + if (l4proto->l4proto != l4num) goto err; + + switch (l4proto->l4proto) { + case IPPROTO_ICMP: + timeouts = &nf_icmp_pernet(net)->timeout; + break; + case IPPROTO_TCP: + timeouts = nf_tcp_pernet(net)->timeouts; + break; + case IPPROTO_UDP: + timeouts = nf_udp_pernet(net)->timeouts; + break; + case IPPROTO_DCCP: +#ifdef CONFIG_NF_CT_PROTO_DCCP + timeouts = nf_dccp_pernet(net)->dccp_timeout; +#endif + break; + case IPPROTO_ICMPV6: + timeouts = &nf_icmpv6_pernet(net)->timeout; + break; + case IPPROTO_SCTP: +#ifdef CONFIG_NF_CT_PROTO_SCTP + timeouts = nf_sctp_pernet(net)->timeouts; +#endif + break; + case 255: + timeouts = &nf_generic_pernet(net)->timeout; + break; + default: + WARN_ON_ONCE(1); + break; } + if (!timeouts) + goto err; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { err = -ENOMEM; @@ -458,8 +492,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, - l3num, - l4proto); + l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); err = -ENOMEM; -- cgit From e4844c9c62a0fe47980d6c3d4b7a096a5d755925 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 2 Nov 2018 11:33:37 +0100 Subject: netfilter: nft_compat: ebtables 'nat' table is normal chain type Unlike ip(6)tables, the ebtables nat table has no special properties. This bug causes 'ebtables -A' to fail when using a target such as 'snat' (ebt_snat target sets ".table = "nat"'). Targets that have no table restrictions work fine. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 768292eac2a4..9d0ede474224 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -54,9 +54,11 @@ static bool nft_xt_put(struct nft_xt *xt) return false; } -static int nft_compat_chain_validate_dependency(const char *tablename, - const struct nft_chain *chain) +static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, + const char *tablename) { + enum nft_chain_types type = NFT_CHAIN_T_DEFAULT; + const struct nft_chain *chain = ctx->chain; const struct nft_base_chain *basechain; if (!tablename || @@ -64,9 +66,12 @@ static int nft_compat_chain_validate_dependency(const char *tablename, return 0; basechain = nft_base_chain(chain); - if (strcmp(tablename, "nat") == 0 && - basechain->type->type != NFT_CHAIN_T_NAT) - return -EINVAL; + if (strcmp(tablename, "nat") == 0) { + if (ctx->family != NFPROTO_BRIDGE) + type = NFT_CHAIN_T_NAT; + if (basechain->type->type != type) + return -EINVAL; + } return 0; } @@ -342,8 +347,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(target->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; } @@ -590,8 +594,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(match->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; } -- cgit From f393808dc64149ccd0e5a8427505ba2974a59854 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Thu, 25 Oct 2018 12:15:43 -0700 Subject: netfilter: conntrack: fix calculation of next bucket number in early_drop If there's no entry to drop in bucket that corresponds to the hash, early_drop() should look for it in other buckets. But since it increments hash instead of bucket number, it actually looks in the same bucket 8 times: hsize is 16k by default (14 bits) and hash is 32-bit value, so reciprocal_scale(hash, hsize) returns the same value for hash..hash+7 in most cases. Fix it by increasing bucket number instead of hash and rename _hash to bucket to avoid future confusion. Fixes: 3e86638e9a0b ("netfilter: conntrack: consider ct netns in early_drop logic") Cc: # v4.7+ Signed-off-by: Vasily Khoruzhick Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ca1168d67fac..e92e749aff53 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1073,19 +1073,22 @@ static unsigned int early_drop_list(struct net *net, return drops; } -static noinline int early_drop(struct net *net, unsigned int _hash) +static noinline int early_drop(struct net *net, unsigned int hash) { - unsigned int i; + unsigned int i, bucket; for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned int hash, hsize, drops; + unsigned int hsize, drops; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hsize); - hash = reciprocal_scale(_hash++, hsize); + if (!i) + bucket = reciprocal_scale(hash, hsize); + else + bucket = (bucket + 1) % hsize; - drops = early_drop_list(net, &ct_hash[hash]); + drops = early_drop_list(net, &ct_hash[bucket]); rcu_read_unlock(); if (drops) { -- cgit From fe60faa5063822f2d555f4f326c7dd72a60929bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Oct 2018 08:39:13 -0700 Subject: net: do not abort bulk send on BQL status Before calling dev_hard_start_xmit(), upper layers tried to cook optimal skb list based on BQL budget. Problem is that GSO packets can end up comsuming more than the BQL budget. Breaking the loop is not useful, since requeued packets are ahead of any packets still in the qdisc. It is also more expensive, since next TX completion will push these packets later, while skbs are not in cpu caches. It is also a behavior difference with TSO packets, that can break the BQL limit by a large amount. Note that drivers should use __netdev_tx_sent_queue() in order to have optimal xmit_more support, and avoid useless atomic operations as shown in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 77d43ae2a7bb..0ffcbdd55fa9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3272,7 +3272,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de } skb = next; - if (netif_xmit_stopped(txq) && skb) { + if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } -- cgit From a277d516de5f498c91d91189717ef7e01102ad27 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Nov 2018 16:36:55 +0100 Subject: openvswitch: fix linking without CONFIG_NF_CONNTRACK_LABELS When CONFIG_CC_OPTIMIZE_FOR_DEBUGGING is enabled, the compiler fails to optimize out a dead code path, which leads to a link failure: net/openvswitch/conntrack.o: In function `ovs_ct_set_labels': conntrack.c:(.text+0x2e60): undefined reference to `nf_connlabels_replace' In this configuration, we can take a shortcut, and completely remove the contrack label code. This may also help the regular optimization. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 6bec37ab4472..a4660c48ff01 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1203,7 +1203,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &info->labels.mask); if (err) return err; - } else if (labels_nonzero(&info->labels.mask)) { + } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(ct, key, &info->labels.value, &info->labels.mask); if (err) -- cgit From 12480e3b16982c4026de10dd8155823219cd6391 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 3 Nov 2018 14:01:31 +0800 Subject: sctp: define SCTP_SS_DEFAULT for Stream schedulers According to rfc8260#section-4.3.2, SCTP_SS_DEFAULT is required to defined as SCTP_SS_FCFS or SCTP_SS_RR. SCTP_SS_FCFS is used for SCTP_SS_DEFAULT's value in this patch. Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: Jianwen Ji Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 9cb854b05342..c37e1c2dec9d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -212,7 +212,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); - sctp_sched_set_sched(asoc, SCTP_SS_FCFS); + sctp_sched_set_sched(asoc, SCTP_SS_DEFAULT); } /* Free the outqueue structure and any related pending chunks. -- cgit From 6915ed86cca6a0b422e8ada05ec7009d2074b88a Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Mon, 5 Nov 2018 20:36:45 +0000 Subject: net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS Move the anycast.c init and cleanup functions which were inadvertently added inside the CONFIG_PROC_FS definition. Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable") Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. Miller --- net/ipv6/anycast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 7698637cf827..94999058e110 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -590,6 +590,7 @@ void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } +#endif /* Init / cleanup code */ @@ -611,4 +612,3 @@ void ipv6_anycast_cleanup(void) WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); spin_unlock(&acaddr_hash_lock); } -#endif -- cgit From 5e1acb4afacc6229946c3d2b7ffc422b53fb2448 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 2 Nov 2018 19:11:04 +0300 Subject: rtnetlink: restore handling of dumpit return value in rtnl_dump_all() For non-zero return from dumpit() we should break the loop in rtnl_dump_all() and return the result. Otherwise, e.g., we could get the memory leak in inet6_dump_fib() [1]. The pointer to the allocated struct fib6_walker there (saved in cb->args) can be lost, reset on the next iteration. Fix it by partially restoring the previous behavior before commit c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit function"). The returned error from dumpit() is still passed further. [1]: unreferenced object 0xffff88001322a200 (size 96): comm "sshd", pid 1484, jiffies 4296032768 (age 1432.542s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 18 09 41 36 00 88 ff ff 18 09 41 36 00 88 ff ff ..A6......A6.... backtrace: [<0000000095846b39>] kmem_cache_alloc_trace+0x151/0x220 [<000000007d12709f>] inet6_dump_fib+0x68d/0x940 [<000000002775a316>] rtnl_dump_all+0x1d9/0x2d0 [<00000000d7cd302b>] netlink_dump+0x945/0x11a0 [<000000002f43485f>] __netlink_dump_start+0x55d/0x800 [<00000000f76bbeec>] rtnetlink_rcv_msg+0x4fa/0xa00 [<000000009b5761f3>] netlink_rcv_skb+0x29c/0x420 [<0000000087a1dae1>] rtnetlink_rcv+0x15/0x20 [<00000000691b703b>] netlink_unicast+0x4e3/0x6c0 [<00000000b5be0204>] netlink_sendmsg+0x7f2/0xba0 [<0000000096d2aa60>] sock_sendmsg+0xba/0xf0 [<000000008c1b786f>] __sys_sendto+0x1e4/0x330 [<0000000019587b3f>] __x64_sys_sendto+0xe1/0x1a0 [<00000000071f4d56>] do_syscall_64+0x9f/0x300 [<000000002737577f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<0000000057587684>] 0xffffffffffffffff Fixes: c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit function") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e01274bd5e3e..33d9227a8b80 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3367,7 +3367,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = 0; } ret = dumpit(skb, cb); - if (ret < 0) + if (ret) break; } cb->family = idx; -- cgit From e22d0bfa09a56e427a1a950ccb85621c86b343a4 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 2 Nov 2018 19:11:05 +0300 Subject: ipv6: properly check return value in inet6_dump_all() Make sure we call fib6_dump_end() if it happens that skb->len is zero. rtnl_dump_all() can reset cb->args on the next loop iteration there. Fixes: 08e814c9e8eb ("net/ipv6: Bail early if user only wants cloned entries") Fixes: ae677bbb4441 ("net: Don't return invalid table id error when dumping all families") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1b8bc008b53b..ae3786132c23 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -591,7 +591,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) /* fib entries are never clones */ if (arg.filter.flags & RTM_F_CLONED) - return skb->len; + goto out; w = (void *)cb->args[2]; if (!w) { @@ -621,7 +621,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (arg.filter.dump_all_families) - return skb->len; + goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; -- cgit From d016b4a3562b745fd9fa387a47d8de62ccd7e241 Mon Sep 17 00:00:00 2001 From: "Matwey V. Kornilov" Date: Fri, 2 Nov 2018 21:19:36 +0300 Subject: net: core: netpoll: Enable netconsole IPv6 link local address There is no reason to discard using source link local address when remote netconsole IPv6 address is set to be link local one. The patch allows administrators to use IPv6 netconsole without explicitly configuring source address: netconsole=@/,@fe80::5054:ff:fe2f:6012/ Signed-off-by: Matwey V. Kornilov Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5da9552b186b..2b9fdbc43205 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -717,7 +717,8 @@ int netpoll_setup(struct netpoll *np) read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; -- cgit From c34c1287778b080ed692c0a46a8e345206cc29e6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 4 Nov 2018 22:37:15 -0800 Subject: sock_diag: fix autoloading of the raw_diag module IPPROTO_RAW isn't registred as an inet protocol, so inet_protos[protocol] is always NULL for it. Cc: Cyrill Gorcunov Cc: Xin Long Fixes: bf2ae2e4bf93 ("sock_diag: request _diag module only when the family or proto has been registered") Signed-off-by: Andrei Vagin Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 6fcc4bc07d19..080a880a1761 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3279,6 +3279,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && + protocol != IPPROTO_RAW && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif -- cgit From 97adaddaa6db7a8af81b9b11e30cbe3628cd6700 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 22:31:41 +0900 Subject: net: bpfilter: fix iptables failure if bpfilter_umh is disabled When iptables command is executed, ip_{set/get}sockopt() try to upload bpfilter.ko if bpfilter is enabled. if it couldn't find bpfilter.ko, command is failed. bpfilter.ko is generated if CONFIG_BPFILTER_UMH is enabled. ip_{set/get}sockopt() only checks CONFIG_BPFILTER. So that if CONFIG_BPFILTER is enabled and CONFIG_BPFILTER_UMH is disabled, iptables command is always failed. test config: CONFIG_BPFILTER=y # CONFIG_BPFILTER_UMH is not set test command: %iptables -L iptables: No chain/target/match by that name. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 26c36cccabdc..fffcc130900e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1246,7 +1246,7 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); @@ -1559,7 +1559,7 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); @@ -1596,7 +1596,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -- cgit From 0d5b9311baf27bb545f187f12ecfd558220c607d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Nov 2018 17:34:27 -0800 Subject: inet: frags: better deal with smp races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple cpus might attempt to insert a new fragment in rhashtable, if for example RPS is buggy, as reported by 배석진 in https://patchwork.ozlabs.org/patch/994601/ We use rhashtable_lookup_get_insert_key() instead of rhashtable_insert_fast() to let cpus losing the race free their own inet_frag_queue and use the one that was inserted by another cpu. Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Eric Dumazet Reported-by: 배석진 Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bcb11f3a27c0..760a9e52e02b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -178,21 +178,22 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - void *arg) + void *arg, + struct inet_frag_queue **prev) { struct inet_frags *f = nf->f; struct inet_frag_queue *q; - int err; q = inet_frag_alloc(nf, f, arg); - if (!q) + if (!q) { + *prev = ERR_PTR(-ENOMEM); return NULL; - + } mod_timer(&q->timer, jiffies + nf->timeout); - err = rhashtable_insert_fast(&nf->rhashtable, &q->node, - f->rhash_params); - if (err < 0) { + *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + &q->node, f->rhash_params); + if (*prev) { q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q); inet_frag_destroy(q); @@ -204,22 +205,22 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_queue *fq; + struct inet_frag_queue *fq = NULL, *prev; if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; rcu_read_lock(); - fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); - if (fq) { + prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (!prev) + fq = inet_frag_create(nf, key, &prev); + if (prev && !IS_ERR(prev)) { + fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) fq = NULL; - rcu_read_unlock(); - return fq; } rcu_read_unlock(); - - return inet_frag_create(nf, key); + return fq; } EXPORT_SYMBOL(inet_frag_find); -- cgit From 62230715fd2453b3ba948c9d83cfb3ada9169169 Mon Sep 17 00:00:00 2001 From: 배석진 Date: Fri, 9 Nov 2018 16:53:06 -0800 Subject: flow_dissector: do not dissect l4 ports for fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only first fragment has the sport/dport information, not the following ones. If we want consistent hash for all fragments, we need to ignore ports even for first fragment. This bug is visible for IPv6 traffic, if incoming fragments do not have a flow label, since skb_get_hash() will give different results for first fragment and following ones. It is also visible if any routing rule wants dissection and sport or dport. See commit 5e5d6fed3741 ("ipv6: route: dissect flow in input path if fib rules need it") for details. [edumazet] rewrote the changelog completely. Fixes: 06635a35d13d ("flow_dissect: use programable dissector in skb_flow_dissect and friends") Signed-off-by: 배석진 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 676f3ad629f9..588f475019d4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1166,8 +1166,8 @@ ip_proto_again: break; } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) && + !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); -- cgit From 63c82997f5c0f3e1b914af43d82f712a86bc5f3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Nov 2018 21:06:26 -0800 Subject: net: sched: cls_flower: validate nested enc_opts_policy to avoid warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCA_FLOWER_KEY_ENC_OPTS and TCA_FLOWER_KEY_ENC_OPTS_MASK can only currently contain further nested attributes, which are parsed by hand, so the policy is never actually used resulting in a W=1 build warning: net/sched/cls_flower.c:492:1: warning: ‘enc_opts_policy’ defined but not used [-Wunused-const-variable=] enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { Add the validation anyway to avoid potential bugs when other attributes are added and to make the attribute structure slightly more clear. Validation will also set extact to point to bad attribute on error. Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Signed-off-by: Jakub Kicinski Acked-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9aada2d0ef06..c6c327874abc 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -709,11 +709,23 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct netlink_ext_ack *extack) { const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; - int option_len, key_depth, msk_depth = 0; + int err, option_len, key_depth, msk_depth = 0; + + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); } -- cgit From 7ab412d33b4c7ff3e0148d3db25dd861edd1283d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 10 Nov 2018 17:30:24 -0500 Subject: tipc: fix link re-establish failure When a link failure is detected locally, the link is reset, the flag link->in_session is set to false, and a RESET_MSG with the 'stopping' bit set is sent to the peer. The purpose of this bit is to inform the peer that this endpoint just is going down, and that the peer should handle the reception of this particular RESET message as a local failure. This forces the peer to accept another RESET or ACTIVATE message from this endpoint before it can re-establish the link. This again is necessary to ensure that link session numbers are properly exchanged before the link comes up again. If a failure is detected locally at the same time at the peer endpoint this will do the same, which is also a correct behavior. However, when receiving such messages, the endpoints will not distinguish between 'stopping' RESETs and ordinary ones when it comes to updating session numbers. Both endpoints will copy the received session number and set their 'in_session' flags to true at the reception, while they are still expecting another RESET from the peer before they can go ahead and re-establish. This is contradictory, since, after applying the validation check referred to below, the 'in_session' flag will cause rejection of all such messages, and the link will never come up again. We now fix this by not only handling received RESET/STOPPING messages as a local failure, but also by omitting to set a new session number and the 'in_session' flag in such cases. Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 201c3b5bc96b..836727e363c4 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1594,14 +1594,17 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; - /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ - if (msg_peer_stopping(hdr)) + /* If peer is going down we want full re-establish cycle */ + if (msg_peer_stopping(hdr)) { rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - else if ((mtyp == RESET_MSG) || !link_is_up(l)) + break; + } + /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ + if (mtyp == RESET_MSG || !link_is_up(l)) rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); /* ACTIVATE_MSG takes up link if it was already locally reset */ - if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING)) + if (mtyp == ACTIVATE_MSG && l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); -- cgit From 7236ead1b14923f3ba35cd29cce13246be83f451 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 10 Nov 2018 16:22:29 -0800 Subject: act_mirred: clear skb->tstamp on redirect If sch_fq is used at ingress, skbs that might have been timestamped by net_timestamp_set() if a packet capture is requesting timestamps could be delayed by arbitrary amount of time, since sch_fq time base is MONOTONIC. Fix this problem by moving code from sch_netem.c to act_mirred.c. Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 3 ++- net/sched/sch_netem.c | 9 --------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1dae5f2b358f..c8cf4d10c435 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -258,7 +258,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; - + if (skb2->tc_from_ingress) + skb2->tstamp = 0; /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 57b3ad9394ad..2c38e3d07924 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -648,15 +648,6 @@ deliver: */ skb->dev = qdisc_dev(sch); -#ifdef CONFIG_NET_CLS_ACT - /* - * If it's at ingress let's pretend the delay is - * from the network (tstamp will be updated). - */ - if (skb->tc_redirected && skb->tc_from_ingress) - skb->tstamp = 0; -#endif - if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); -- cgit