From 733ef7f056a5e23b66e8e7bb3508ca882db388f0 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 18 Sep 2019 09:57:39 +0200 Subject: xsk: relax UMEM headroom alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch removes the 64B alignment of the UMEM headroom. There is really no reason for it, and having a headroom less than 64B should be valid. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..cdaef54d48be 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -383,8 +383,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - headroom = ALIGN(headroom, 64); - size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; -- cgit From cf0eba334268563152e4a8bc9ab865d0037a7948 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Thu, 12 Sep 2019 12:04:50 -0700 Subject: net/ncsi: Disable global multicast filter Disabling multicast filtering from NCSI if it is supported. As it should not filter any multicast packets. In current code, multicast filter is enabled and with an exception of optional field supported by device are disabled filtering. Mainly I see if goal is to disable filtering for IPV6 packets then let it disabled for every other types as well. As we are seeing issues with LLDP not working with this enabled filtering. And there are other issues with IPV6. By Disabling this multicast completely, it is working for both IPV6 as well as LLDP. Signed-off-by: Vijay Khemka Acked-by: Samuel Mendoza-Jonas Signed-off-by: Jakub Kicinski --- net/ncsi/internal.h | 7 +--- net/ncsi/ncsi-manage.c | 98 ++++++-------------------------------------------- 2 files changed, 12 insertions(+), 93 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 0b3f0673e1a2..ad3fd7f1da75 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -264,9 +264,7 @@ enum { ncsi_dev_state_config_ev, ncsi_dev_state_config_sma, ncsi_dev_state_config_ebf, -#if IS_ENABLED(CONFIG_IPV6) - ncsi_dev_state_config_egmf, -#endif + ncsi_dev_state_config_dgmf, ncsi_dev_state_config_ecnt, ncsi_dev_state_config_ec, ncsi_dev_state_config_ae, @@ -295,9 +293,6 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ -#if IS_ENABLED(CONFIG_IPV6) - unsigned int inet6_addr_num; /* Number of IPv6 addresses */ -#endif unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 755aab66dcab..70fe02697544 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "internal.h" @@ -978,9 +977,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_config_ev: case ncsi_dev_state_config_sma: case ncsi_dev_state_config_ebf: -#if IS_ENABLED(CONFIG_IPV6) - case ncsi_dev_state_config_egmf: -#endif + case ncsi_dev_state_config_dgmf: case ncsi_dev_state_config_ecnt: case ncsi_dev_state_config_ec: case ncsi_dev_state_config_ae: @@ -1033,23 +1030,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; - if (ncsi_channel_is_tx(ndp, nc)) + /* if multicast global filtering is supported then + * disable it so that all multicast packet will be + * forwarded to management controller + */ + if (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC) + nd->state = ncsi_dev_state_config_dgmf; + else if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; -#if IS_ENABLED(CONFIG_IPV6) - if (ndp->inet6_addr_num > 0 && - (nc->caps[NCSI_CAP_GENERIC].cap & - NCSI_CAP_GENERIC_MC)) - nd->state = ncsi_dev_state_config_egmf; - } else if (nd->state == ncsi_dev_state_config_egmf) { - nca.type = NCSI_PKT_CMD_EGMF; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + } else if (nd->state == ncsi_dev_state_config_dgmf) { + nca.type = NCSI_PKT_CMD_DGMF; if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; -#endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { if (np->preferred_channel && nc != np->preferred_channel) @@ -1483,70 +1480,6 @@ out: return -ENODEV; } -#if IS_ENABLED(CONFIG_IPV6) -static int ncsi_inet6addr_event(struct notifier_block *this, - unsigned long event, void *data) -{ - struct inet6_ifaddr *ifa = data; - struct net_device *dev = ifa->idev->dev; - struct ncsi_dev *nd = ncsi_find_dev(dev); - struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; - struct ncsi_package *np; - struct ncsi_channel *nc; - struct ncsi_cmd_arg nca; - bool action; - int ret; - - if (!ndp || (ipv6_addr_type(&ifa->addr) & - (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) - return NOTIFY_OK; - - switch (event) { - case NETDEV_UP: - action = (++ndp->inet6_addr_num) == 1; - nca.type = NCSI_PKT_CMD_EGMF; - break; - case NETDEV_DOWN: - action = (--ndp->inet6_addr_num == 0); - nca.type = NCSI_PKT_CMD_DGMF; - break; - default: - return NOTIFY_OK; - } - - /* We might not have active channel or packages. The IPv6 - * required multicast will be enabled when active channel - * or packages are chosen. - */ - np = ndp->active_package; - nc = ndp->active_channel; - if (!action || !np || !nc) - return NOTIFY_OK; - - /* We needn't enable or disable it if the function isn't supported */ - if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) - return NOTIFY_OK; - - nca.ndp = ndp; - nca.req_flags = 0; - nca.package = np->id; - nca.channel = nc->id; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - ret = ncsi_xmit_cmd(&nca); - if (ret) { - netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", - (event == NETDEV_UP) ? "enable" : "disable", ret); - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - -static struct notifier_block ncsi_inet6addr_notifier = { - .notifier_call = ncsi_inet6addr_event, -}; -#endif /* CONFIG_IPV6 */ - static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1725,11 +1658,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, } spin_lock_irqsave(&ncsi_dev_lock, flags); -#if IS_ENABLED(CONFIG_IPV6) - ndp->inet6_addr_num = 0; - if (list_empty(&ncsi_dev_list)) - register_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); @@ -1896,10 +1824,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); -#if IS_ENABLED(CONFIG_IPV6) - if (list_empty(&ncsi_dev_list)) - unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); ncsi_unregister_netlink(nd->dev); -- cgit From ad652f3811d8644d547506154ec9a9c22c8771cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Sep 2019 18:33:08 +0200 Subject: netfilter: nf_tables: add NFT_CHAIN_POLICY_UNSET and use it Default policy is defined as a unsigned 8-bit field, do not use a negative value to leave it unset, use this new NFT_CHAIN_POLICY_UNSET instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e4a68dc42694..4a5d6ef2b706 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1715,7 +1715,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err2; } - nft_trans_chain_policy(trans) = -1; + nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; -- cgit From ff175d0b0eab99f512b9afdb571f0ed18b63f533 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Sep 2019 18:33:09 +0200 Subject: netfilter: nf_tables_offload: fix always true policy is unset check New smatch warnings: net/netfilter/nf_tables_offload.c:316 nft_flow_offload_chain() warn: always true condition '(policy != -1) => (0-255 != (-1))' Reported-by: kbuild test robot Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 21bb772cb4b7..e546f759b7a7 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -313,7 +313,7 @@ static int nft_flow_offload_chain(struct nft_chain *chain, policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ - if (cmd == FLOW_BLOCK_BIND && policy != -1 && policy != NF_ACCEPT) + if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; if (dev->netdev_ops->ndo_setup_tc) -- cgit From acab713177377d9e0889c46bac7ff0cfb9a90c4d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 19 Sep 2019 16:56:44 +0200 Subject: netfilter: nf_tables: allow lookups in dynamic sets This un-breaks lookups in sets that have the 'dynamic' flag set. Given this active example configuration: table filter { set set1 { type ipv4_addr size 64 flags dynamic,timeout timeout 1m } chain input { type filter hook input priority 0; policy accept; } } ... this works: nft add rule ip filter input add @set1 { ip saddr } -> whenever rule is triggered, the source ip address is inserted into the set (if it did not exist). This won't work: nft add rule ip filter input ip saddr @set1 counter Error: Could not process rule: Operation not supported In other words, we can add entries to the set, but then can't make matching decision based on that set. That is just wrong -- all set backends support lookups (else they would not be very useful). The failure comes from an explicit rejection in nft_lookup.c. Looking at the history, it seems like NFT_SET_EVAL used to mean 'set contains expressions' (aka. "is a meter"), for instance something like nft add rule ip filter input meter example { ip saddr limit rate 10/second } or nft add rule ip filter input meter example { ip saddr counter } The actual meaning of NFT_SET_EVAL however, is 'set can be updated from the packet path'. 'meters' and packet-path insertions into sets, such as 'add @set { ip saddr }' use exactly the same kernel code (nft_dynset.c) and thus require a set backend that provides the ->update() function. The only set that provides this also is the only one that has the NFT_SET_EVAL feature flag. Removing the wrong check makes the above example work. While at it, also fix the flag check during set instantiation to allow supported combinations only. Fixes: 8aeff920dcc9b3f ("netfilter: nf_tables: add stateful object reference to set elements") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- net/netfilter/nft_lookup.c | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4a5d6ef2b706..6dc46f9b5f7b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3562,8 +3562,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, NFT_SET_OBJECT)) return -EINVAL; /* Only one of these operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == - (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) + if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_OBJECT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c0560bf3c31b..660bad688e2b 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -73,9 +73,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_EVAL) - return -EOPNOTSUPP; - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) -- cgit From 92974a1d006ad8b30d53047c70974c9e065eb7df Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 17 Sep 2019 11:30:55 +0200 Subject: net/sched: act_sample: don't push mac header on ip6gre ingress current 'sample' action doesn't push the mac header of ingress packets if they are received by a layer 3 tunnel (like gre or sit); but it forgot to check for gre over ipv6, so the following script: # tc q a dev $d clsact # tc f a dev $d ingress protocol ip flower ip_proto icmp action sample \ > group 100 rate 1 # psample -v -g 100 dumps everything, including outer header and mac, when $d is a gre tunnel over ipv6. Fix this adding a missing label for ARPHRD_IP6GRE devices. Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Signed-off-by: Davide Caratti Reviewed-by: Yotam Gigi Signed-off-by: Jakub Kicinski --- net/sched/act_sample.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 692c4c9040fd..514456a0b9a8 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -146,6 +146,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev) case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: + case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; -- cgit From 77d5bc7e6a6cf8bbeca31aab7f0c5449a5eee762 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:39:49 -0700 Subject: ipv4: Revert removal of rt_uses_gateway Julian noted that rt_uses_gateway has a more subtle use than 'is gateway set': https://lore.kernel.org/netdev/alpine.LFD.2.21.1909151104060.2546@ja.home.ssi.bg/ Revert that part of the commit referenced in the Fixes tag. Currently, there are no u8 holes in 'struct rtable'. There is a 4-byte hole in the second cacheline which contains the gateway declaration. So move rt_gw_family down to the gateway declarations since they are always used together, and then re-use that u8 for rt_uses_gateway. End result is that rtable size is unchanged. Fixes: 1550c171935d ("ipv4: Prepare rtable for IPv6 gateway") Reported-by: Julian Anastasov Signed-off-by: David Ahern Reviewed-by: Julian Anastasov Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/route.c | 36 +++++++++++++++++++++--------------- net/ipv4/xfrm4_policy.c | 1 + 5 files changed, 26 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f5c163d4771b..a9183543ca30 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 06f6f280b9ff..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gw_family) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5eb73775c3f7..a77c3a4c24de 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -499,7 +499,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b6a6f18c3dd1..7dcce724c78b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -635,6 +635,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } @@ -1313,7 +1314,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_gw_family && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1569,6 +1570,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) @@ -1634,6 +1636,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; + rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2694,6 +2697,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; + rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; @@ -2778,21 +2782,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gw_family == AF_INET && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { - goto nla_put_failure; - } else if (rt->rt_gw_family == AF_INET6) { - int alen = sizeof(struct in6_addr); - struct nlattr *nla; - struct rtvia *via; - - nla = nla_reserve(skb, RTA_VIA, alen + 2); - if (!nla) + if (rt->rt_uses_gateway) { + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; - - via = nla_data(nla); - via->rtvia_family = AF_INET6; - memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } } expires = rt->dst.expires; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cdef8f9a3b01..35b84b52b702 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; -- cgit From b41d936b5ecfdb3a4abc525ce6402a6c49cffddc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Sep 2019 08:05:39 -0700 Subject: sch_netem: fix a divide by zero in tabledist() syzbot managed to crash the kernel in tabledist() loading an empty distribution table. t = dist->table[rnd % dist->size]; Simply return an error when such load is attempted. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b17f2ed970e2..f5cb35e550f8 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -777,7 +777,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, struct disttable *d; int i; - if (n > NETEM_DIST_MAX) + if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); -- cgit From 7b09c2d052db4b4ad0b27b97918b46a7746966fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Sep 2019 10:12:36 -0700 Subject: ipv6: fix a typo in fib6_rule_lookup() Yi Ren reported an issue discovered by syzkaller, and bisected to the cited commit. Many thanks to Yi, this trivial patch does not reflect the patient work that has been done. Fixes: d64a1f574a29 ("ipv6: honor RT6_LOOKUP_F_DST_NOREF in rule lookup logic") Signed-off-by: Eric Dumazet Acked-by: Wei Wang Bisected-and-reported-by: Yi Ren Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 87f47bc55c5e..6e2af411cd9c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -318,7 +318,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } -- cgit From 73f0c11d11329a0d6d205d4312b6e5d2512af7c5 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 18 Sep 2019 10:21:17 -0700 Subject: net: qrtr: Stop rx_worker before freeing node As the endpoint is unregistered there might still be work pending to handle incoming messages, which will result in a use after free scenario. The plan is to remove the rx_worker, but until then (and for stable@) ensure that the work is stopped before the node is freed. Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Cc: stable@vger.kernel.org Signed-off-by: Bjorn Andersson Signed-off-by: Jakub Kicinski --- net/qrtr/qrtr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 6c8b0f6d28f9..88f98f27ad88 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -150,6 +150,7 @@ static void __qrtr_node_release(struct kref *kref) list_del(&node->item); mutex_unlock(&qrtr_node_lock); + cancel_work_sync(&node->work); skb_queue_purge(&node->rx_queue); kfree(node); } -- cgit From 62794fc4fbf52f2209dc094ea255eaef760e7d01 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 18 Sep 2019 16:24:12 -0700 Subject: net_sched: add max len check for TCA_KIND The TCA_KIND attribute is of NLA_STRING which does not check the NUL char. KMSAN reported an uninit-value of TCA_KIND which is likely caused by the lack of NUL. Change it to NLA_NUL_STRING and add a max len too. Fixes: 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes") Reported-and-tested-by: syzbot+618aacd49e8c8b8486bd@syzkaller.appspotmail.com Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Reviewed-by: David Ahern Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1047825d9f48..81d58b280612 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1390,7 +1390,8 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { - [TCA_KIND] = { .type = NLA_STRING }, + [TCA_KIND] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, -- cgit From 199ce850ce112315cfc68d42b694bcaa27b097b7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 18 Sep 2019 18:44:43 -0700 Subject: net_sched: add policy validation for action attributes Similar to commit 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes"), we need to add proper policy validation for TC action attributes too. Cc: David Ahern Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 339712296164..2558f00f6b3e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -831,6 +831,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { + [TCA_ACT_KIND] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, + [TCA_ACT_INDEX] = { .type = NLA_U32 }, + [TCA_ACT_COOKIE] = { .type = NLA_BINARY, + .len = TC_COOKIE_MAX_SIZE }, + [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, +}; + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -846,8 +855,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, - extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; @@ -856,18 +865,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { - NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; - } - if (tb[TCA_ACT_COOKIE]) { - int cklen = nla_len(tb[TCA_ACT_COOKIE]); - - if (cklen > TC_COOKIE_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); - goto err_out; - } + nla_strlcpy(act_name, kind, IFNAMSIZ); + if (tb[TCA_ACT_COOKIE]) { cookie = nla_memdup_cookie(tb); if (!cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); @@ -1098,7 +1098,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1152,7 +1153,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, b = skb_tail_pointer(skb); - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1440,7 +1442,7 @@ static struct nlattr *find_dump_kind(struct nlattr **nla) if (tb[1] == NULL) return NULL; - if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; -- cgit From 3d66b89c30f9220a72e92847768fc8ba4d027d88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Sep 2019 12:57:04 -0700 Subject: net: sched: fix possible crash in tcf_action_destroy() If the allocation done in tcf_exts_init() failed, we end up with a NULL pointer in exts->actions. kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8198 Comm: syz-executor.3 Not tainted 5.3.0-rc8+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:tcf_action_destroy+0x71/0x160 net/sched/act_api.c:705 Code: c3 08 44 89 ee e8 4f cb bb fb 41 83 fd 20 0f 84 c9 00 00 00 e8 c0 c9 bb fb 48 89 d8 48 b9 00 00 00 00 00 fc ff df 48 c1 e8 03 <80> 3c 08 00 0f 85 c0 00 00 00 4c 8b 33 4d 85 f6 0f 84 9d 00 00 00 RSP: 0018:ffff888096e16ff0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: dffffc0000000000 RDX: 0000000000040000 RSI: ffffffff85b6ab30 RDI: 0000000000000000 RBP: ffff888096e17020 R08: ffff8880993f6140 R09: fffffbfff11cae67 R10: fffffbfff11cae66 R11: ffffffff88e57333 R12: 0000000000000000 R13: 0000000000000000 R14: ffff888096e177a0 R15: 0000000000000001 FS: 00007f62bc84a700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000758040 CR3: 0000000088b64000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcf_exts_destroy+0x38/0xb0 net/sched/cls_api.c:3030 tcindex_set_parms+0xf7f/0x1e50 net/sched/cls_tcindex.c:488 tcindex_change+0x230/0x318 net/sched/cls_tcindex.c:519 tc_new_tfilter+0xa4b/0x1c70 net/sched/cls_api.c:2152 rtnetlink_rcv_msg+0x838/0xb00 net/core/rtnetlink.c:5214 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] Fixes: 90b73b77d08e ("net: sched: change action API to use array of pointers to actions") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Vlad Buslov Cc: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 32577c248968..64584a1df425 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2894,8 +2894,10 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); - kfree(exts->actions); + if (exts->actions) { + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); + kfree(exts->actions); + } exts->nr_actions = 0; #endif } -- cgit From 6cc03e8aa36c51f3b26a0d21a3c4ce2809c842ac Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:46 +0200 Subject: appletalk: enforce CAP_NET_RAW for raw sockets When creating a raw AF_APPLETALK socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4072e9d394d6..b41375d4d295 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1023,6 +1023,11 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, */ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; + + rc = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out; + rc = -ENOMEM; sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) -- cgit From 0614e2b73768b502fc32a75349823356d98aae2c Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:47 +0200 Subject: ax25: enforce CAP_NET_RAW for raw sockets When creating a raw AF_AX25 socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ca5207767dc2..bb222b882b67 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -855,6 +855,8 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, break; case SOCK_RAW: + if (!capable(CAP_NET_RAW)) + return -EPERM; break; default: return -ESOCKTNOSUPPORT; -- cgit From e69dbd4619e7674c1679cba49afd9dd9ac347eef Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:48 +0200 Subject: ieee802154: enforce CAP_NET_RAW for raw sockets When creating a raw AF_IEEE802154 socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/socket.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index badc5cfe4dc6..d93d4531aa9b 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1008,6 +1008,9 @@ static int ieee802154_create(struct net *net, struct socket *sock, switch (sock->type) { case SOCK_RAW: + rc = -EPERM; + if (!capable(CAP_NET_RAW)) + goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; -- cgit From 3a359798b176183ef09efb7a3dc59abad1cc7104 Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:49 +0200 Subject: nfc: enforce CAP_NET_RAW for raw sockets When creating a raw AF_NFC socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9b8742947aff..8dfea26536c9 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -1004,10 +1004,13 @@ static int llcp_sock_create(struct net *net, struct socket *sock, sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!capable(CAP_NET_RAW)) + return -EPERM; sock->ops = &llcp_rawsock_ops; - else + } else { sock->ops = &llcp_sock_ops; + } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) -- cgit From 9b05b6e11d5e93a3a517cadc12b9836e0470c255 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 24 Sep 2019 14:42:44 +0200 Subject: netfilter: nf_tables: bogus EBUSY when deleting flowtable after flush The deletion of a flowtable after a flush in the same transaction results in EBUSY. This patch adds an activation and deactivation of flowtables in order to update the _use_ counter. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++++++++++ net/netfilter/nft_flow_offload.c | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6dc46f9b5f7b..d481f9baca2f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5598,6 +5598,22 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase) +{ + switch (phase) { + case NFT_TRANS_PREPARE: + case NFT_TRANS_ABORT: + case NFT_TRANS_RELEASE: + flowtable->use--; + /* fall through */ + default: + return; + } +} +EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); + static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 22cf236eb5d5..f29bbc74c4bf 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -177,6 +177,23 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } +static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); +} + +static void nft_flow_offload_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + priv->flowtable->use++; +} + static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -205,6 +222,8 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, + .activate = nft_flow_offload_activate, + .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, -- cgit From bf69abad27d8fe1daca9558441fd0205fb2d7bc9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 23 Sep 2019 17:52:42 +0200 Subject: net: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Sven Eckelmann Signed-off-by: David S. Miller --- net/batman-adv/Kconfig | 10 ++-- net/ife/Kconfig | 2 +- net/ipv4/Kconfig | 4 +- net/ipv6/netfilter/Kconfig | 16 ++--- net/netfilter/Kconfig | 2 +- net/netfilter/ipvs/Kconfig | 6 +- net/rds/Kconfig | 4 +- net/sched/Kconfig | 144 ++++++++++++++++++++++----------------------- 8 files changed, 94 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index a3d188dfbe75..d5028af750d5 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -12,11 +12,11 @@ config BATMAN_ADV depends on NET select LIBCRC32C help - B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is - a routing protocol for multi-hop ad-hoc mesh networks. The - networks may be wired or wireless. See - https://www.open-mesh.org/ for more information and user space - tools. + B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is + a routing protocol for multi-hop ad-hoc mesh networks. The + networks may be wired or wireless. See + https://www.open-mesh.org/ for more information and user space + tools. config BATMAN_ADV_BATMAN_V bool "B.A.T.M.A.N. V protocol" diff --git a/net/ife/Kconfig b/net/ife/Kconfig index 6cd1f6d18f30..bcf650564db4 100644 --- a/net/ife/Kconfig +++ b/net/ife/Kconfig @@ -5,7 +5,7 @@ menuconfig NET_IFE depends on NET - tristate "Inter-FE based on IETF ForCES InterFE LFB" + tristate "Inter-FE based on IETF ForCES InterFE LFB" default n help Say Y here to add support of IFE encapsulation protocol diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 974de4d20f25..03381f3e12ba 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -492,8 +492,8 @@ config TCP_CONG_WESTWOOD wired networks and throughput over wireless links. config TCP_CONG_HTCP - tristate "H-TCP" - default m + tristate "H-TCP" + default m ---help--- H-TCP is a send-side only modifications of the TCP Reno protocol stack that optimizes the performance of TCP diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6120a7800975..69443e9a3aa5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -170,13 +170,13 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_SRH - tristate '"srh" Segment Routing header match support' - depends on NETFILTER_ADVANCED - help - srh matching allows you to match packets based on the segment + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment routing header of the packet. - To compile it as a module, choose M here. If unsure, say N. + To compile it as a module, choose M here. If unsure, say N. # The targets config IP6_NF_TARGET_HL @@ -249,10 +249,10 @@ config IP6_NF_SECURITY depends on SECURITY depends on NETFILTER_ADVANCED help - This option adds a `security' table to iptables, for use - with Mandatory Access Control (MAC) policy. + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. - If unsure, say N. + If unsure, say N. config IP6_NF_NAT tristate "ip6tables NAT support" diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 34ec7afec116..91efae88e8c2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -697,7 +697,7 @@ config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" depends on NF_FLOW_TABLE help - This option adds the flow table mixed IPv4/IPv6 support. + This option adds the flow table mixed IPv4/IPv6 support. To compile it as a module, choose M here. diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index f6f1a0d5c47d..5b672e05d758 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -135,7 +135,7 @@ config IP_VS_WRR module, choose M here. If unsure, say N. config IP_VS_LC - tristate "least-connection scheduling" + tristate "least-connection scheduling" ---help--- The least-connection scheduling algorithm directs network connections to the server with the least number of active @@ -145,7 +145,7 @@ config IP_VS_LC module, choose M here. If unsure, say N. config IP_VS_WLC - tristate "weighted least-connection scheduling" + tristate "weighted least-connection scheduling" ---help--- The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections @@ -333,7 +333,7 @@ config IP_VS_NFCT config IP_VS_PE_SIP tristate "SIP persistence engine" - depends on IP_VS_PROTO_UDP + depends on IP_VS_PROTO_UDP depends on NF_CONNTRACK_SIP ---help--- Allow persistence based on the SIP Call-ID diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 38ea7f0f2699..c64e154bc18f 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -23,6 +23,6 @@ config RDS_TCP This transport does not support RDMA operations. config RDS_DEBUG - bool "RDS debugging messages" + bool "RDS debugging messages" depends on RDS - default n + default n diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b3faafeafab9..5b044ae6dc1e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -324,7 +324,7 @@ config NET_SCH_CAKE tristate "Common Applications Kept Enhanced (CAKE)" help Say Y here if you want to use the Common Applications Kept Enhanced - (CAKE) queue management algorithm. + (CAKE) queue management algorithm. To compile this driver as a module, choose M here: the module will be called sch_cake. @@ -730,8 +730,8 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT - ---help--- + depends on NET_CLS_ACT + ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing module. @@ -740,9 +740,9 @@ config NET_ACT_POLICE module will be called act_police. config NET_ACT_GACT - tristate "Generic actions" - depends on NET_CLS_ACT - ---help--- + tristate "Generic actions" + depends on NET_CLS_ACT + ---help--- Say Y here to take generic actions such as dropping and accepting packets. @@ -750,15 +750,15 @@ config NET_ACT_GACT module will be called act_gact. config GACT_PROB - bool "Probability support" - depends on NET_ACT_GACT - ---help--- + bool "Probability support" + depends on NET_ACT_GACT + ---help--- Say Y here to use the generic action randomly or deterministically. config NET_ACT_MIRRED - tristate "Redirecting and Mirroring" - depends on NET_CLS_ACT - ---help--- + tristate "Redirecting and Mirroring" + depends on NET_CLS_ACT + ---help--- Say Y here to allow packets to be mirrored or redirected to other devices. @@ -766,10 +766,10 @@ config NET_ACT_MIRRED module will be called act_mirred. config NET_ACT_SAMPLE - tristate "Traffic Sampling" - depends on NET_CLS_ACT - select PSAMPLE - ---help--- + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- Say Y here to allow packet sampling tc action. The packet sample action consists of statistically choosing packets and sampling them using the psample module. @@ -778,9 +778,9 @@ config NET_ACT_SAMPLE module will be called act_sample. config NET_ACT_IPT - tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - ---help--- + tristate "IPtables targets" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- Say Y here to be able to invoke iptables targets after successful classification. @@ -788,9 +788,9 @@ config NET_ACT_IPT module will be called act_ipt. config NET_ACT_NAT - tristate "Stateless NAT" - depends on NET_CLS_ACT - ---help--- + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- Say Y here to do stateless NAT on IPv4 packets. You should use netfilter for NAT unless you know what you are doing. @@ -798,18 +798,18 @@ config NET_ACT_NAT module will be called act_nat. config NET_ACT_PEDIT - tristate "Packet Editing" - depends on NET_CLS_ACT - ---help--- + tristate "Packet Editing" + depends on NET_CLS_ACT + ---help--- Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the module will be called act_pedit. config NET_ACT_SIMP - tristate "Simple Example (Debug)" - depends on NET_CLS_ACT - ---help--- + tristate "Simple Example (Debug)" + depends on NET_CLS_ACT + ---help--- Say Y here to add a simple action for demonstration purposes. It is meant as an example and for debugging purposes. It will print a configured policy string followed by the packet count @@ -821,9 +821,9 @@ config NET_ACT_SIMP module will be called act_simple. config NET_ACT_SKBEDIT - tristate "SKB Editing" - depends on NET_CLS_ACT - ---help--- + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- Say Y here to change skb priority or queue_mapping settings. If unsure, say N. @@ -832,10 +832,10 @@ config NET_ACT_SKBEDIT module will be called act_skbedit. config NET_ACT_CSUM - tristate "Checksum Updating" - depends on NET_CLS_ACT && INET - select LIBCRC32C - ---help--- + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + select LIBCRC32C + ---help--- Say Y here to update some common checksum after some direct packet alterations. @@ -854,9 +854,9 @@ config NET_ACT_MPLS module will be called act_mpls. config NET_ACT_VLAN - tristate "Vlan manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "Vlan manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to push or pop vlan headers. If unsure, say N. @@ -865,9 +865,9 @@ config NET_ACT_VLAN module will be called act_vlan. config NET_ACT_BPF - tristate "BPF based action" - depends on NET_CLS_ACT - ---help--- + tristate "BPF based action" + depends on NET_CLS_ACT + ---help--- Say Y here to execute BPF code on packets. The BPF code will decide if the packet should be dropped or not. @@ -877,10 +877,10 @@ config NET_ACT_BPF module will be called act_bpf. config NET_ACT_CONNMARK - tristate "Netfilter Connection Mark Retriever" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - ---help--- + tristate "Netfilter Connection Mark Retriever" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + ---help--- Say Y here to allow retrieving of conn mark If unsure, say N. @@ -889,10 +889,10 @@ config NET_ACT_CONNMARK module will be called act_connmark. config NET_ACT_CTINFO - tristate "Netfilter Connection Mark Actions" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - help + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help Say Y here to allow transfer of a connmark stored information. Current actions transfer connmark stored DSCP into ipv4/v6 diffserv and/or to transfer connmark to packet @@ -906,21 +906,21 @@ config NET_ACT_CTINFO module will be called act_ctinfo. config NET_ACT_SKBMOD - tristate "skb data modification action" - depends on NET_CLS_ACT - ---help--- - Say Y here to allow modification of skb data + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data - If unsure, say N. + If unsure, say N. - To compile this code as a module, choose M here: the - module will be called act_skbmod. + To compile this code as a module, choose M here: the + module will be called act_skbmod. config NET_ACT_IFE - tristate "Inter-FE action based on IETF ForCES InterFE LFB" - depends on NET_CLS_ACT - select NET_IFE - ---help--- + tristate "Inter-FE action based on IETF ForCES InterFE LFB" + depends on NET_CLS_ACT + select NET_IFE + ---help--- Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: "Distributing Linux Traffic Control Classifier-Action Subsystem" @@ -930,9 +930,9 @@ config NET_ACT_IFE module will be called act_ife. config NET_ACT_TUNNEL_KEY - tristate "IP tunnel metadata manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to set/release ip tunnel metadata. If unsure, say N. @@ -941,9 +941,9 @@ config NET_ACT_TUNNEL_KEY module will be called act_tunnel_key. config NET_ACT_CT - tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT - help + tristate "connection tracking tc action" + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT + help Say Y here to allow sending the packets to conntrack module. If unsure, say N. @@ -952,16 +952,16 @@ config NET_ACT_CT module will be called act_ct. config NET_IFE_SKBMARK - tristate "Support to encoding decoding skb mark on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb mark on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBPRIO - tristate "Support to encoding decoding skb prio on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb prio on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBTCINDEX - tristate "Support to encoding decoding skb tcindex on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE config NET_TC_SKB_EXT bool "TC recirculation support" -- cgit From 3e8b9bfa110896f95d602d8c98d5f9d67e41d78c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 23 Sep 2019 22:04:58 -0700 Subject: net/sched: cbs: Fix not adding cbs instance to list When removing a cbs instance when offloading is enabled, the crash below can be observed. The problem happens because that when offloading is enabled, the cbs instance is not added to the list. Also, the current code doesn't handle correctly the case when offload is disabled without removing the qdisc: if the link speed changes the credit calculations will be wrong. When we create the cbs instance with offloading enabled, it's not added to the notification list, when later we disable offloading, it's not in the list, so link speed changes will not affect it. The solution for both issues is the same, add the cbs instance being created unconditionally to the global list, even if the link state notification isn't useful "right now". Crash log: [518758.189866] BUG: kernel NULL pointer dereference, address: 0000000000000000 [518758.189870] #PF: supervisor read access in kernel mode [518758.189871] #PF: error_code(0x0000) - not-present page [518758.189872] PGD 0 P4D 0 [518758.189874] Oops: 0000 [#1] SMP PTI [518758.189876] CPU: 3 PID: 4825 Comm: tc Not tainted 5.2.9 #1 [518758.189877] Hardware name: Gigabyte Technology Co., Ltd. Z390 AORUS ULTRA/Z390 AORUS ULTRA-CF, BIOS F7 03/14/2019 [518758.189881] RIP: 0010:__list_del_entry_valid+0x29/0xa0 [518758.189883] Code: 90 48 b8 00 01 00 00 00 00 ad de 55 48 8b 17 4c 8b 47 08 48 89 e5 48 39 c2 74 27 48 b8 00 02 00 00 00 00 ad de 49 39 c0 74 2d <49> 8b 30 48 39 fe 75 3d 48 8b 52 08 48 39 f2 75 4c b8 01 00 00 00 [518758.189885] RSP: 0018:ffffa27e43903990 EFLAGS: 00010207 [518758.189887] RAX: dead000000000200 RBX: ffff8bce69f0f000 RCX: 0000000000000000 [518758.189888] RDX: 0000000000000000 RSI: ffff8bce69f0f064 RDI: ffff8bce69f0f1e0 [518758.189890] RBP: ffffa27e43903990 R08: 0000000000000000 R09: ffff8bce69e788c0 [518758.189891] R10: ffff8bce62acd400 R11: 00000000000003cb R12: ffff8bce69e78000 [518758.189892] R13: ffff8bce69f0f140 R14: 0000000000000000 R15: 0000000000000000 [518758.189894] FS: 00007fa1572c8f80(0000) GS:ffff8bce6e0c0000(0000) knlGS:0000000000000000 [518758.189895] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [518758.189896] CR2: 0000000000000000 CR3: 000000040a398006 CR4: 00000000003606e0 [518758.189898] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [518758.189899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [518758.189900] Call Trace: [518758.189904] cbs_destroy+0x32/0xa0 [sch_cbs] [518758.189906] qdisc_destroy+0x45/0x120 [518758.189907] qdisc_put+0x25/0x30 [518758.189908] qdisc_graft+0x2c1/0x450 [518758.189910] tc_get_qdisc+0x1c8/0x310 [518758.189912] ? get_page_from_freelist+0x91a/0xcb0 [518758.189914] rtnetlink_rcv_msg+0x293/0x360 [518758.189916] ? kmem_cache_alloc_node_trace+0x178/0x260 [518758.189918] ? __kmalloc_node_track_caller+0x38/0x50 [518758.189920] ? rtnl_calcit.isra.0+0xf0/0xf0 [518758.189922] netlink_rcv_skb+0x48/0x110 [518758.189923] rtnetlink_rcv+0x10/0x20 [518758.189925] netlink_unicast+0x15b/0x1d0 [518758.189926] netlink_sendmsg+0x1ea/0x380 [518758.189929] sock_sendmsg+0x2f/0x40 [518758.189930] ___sys_sendmsg+0x295/0x2f0 [518758.189932] ? ___sys_recvmsg+0x151/0x1e0 [518758.189933] ? do_wp_page+0x7e/0x450 [518758.189935] __sys_sendmsg+0x48/0x80 [518758.189937] __x64_sys_sendmsg+0x1a/0x20 [518758.189939] do_syscall_64+0x53/0x1f0 [518758.189941] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [518758.189942] RIP: 0033:0x7fa15755169a [518758.189944] Code: 48 c7 c0 ff ff ff ff eb be 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 18 b8 2e 00 00 00 c5 fc 77 0f 05 <48> 3d 00 f0 ff ff 77 5e c3 0f 1f 44 00 00 48 83 ec 28 89 54 24 1c [518758.189946] RSP: 002b:00007ffda58b60b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [518758.189948] RAX: ffffffffffffffda RBX: 000055e4b836d9a0 RCX: 00007fa15755169a [518758.189949] RDX: 0000000000000000 RSI: 00007ffda58b6128 RDI: 0000000000000003 [518758.189951] RBP: 00007ffda58b6190 R08: 0000000000000001 R09: 000055e4b9d848a0 [518758.189952] R10: 0000000000000000 R11: 0000000000000246 R12: 000000005d654b49 [518758.189953] R13: 0000000000000000 R14: 00007ffda58b6230 R15: 00007ffda58b6210 [518758.189955] Modules linked in: sch_cbs sch_etf sch_mqprio netlink_diag unix_diag e1000e igb intel_pch_thermal thermal video backlight pcc_cpufreq [518758.189960] CR2: 0000000000000000 [518758.189961] ---[ end trace 6a13f7aaf5376019 ]--- [518758.189963] RIP: 0010:__list_del_entry_valid+0x29/0xa0 [518758.189964] Code: 90 48 b8 00 01 00 00 00 00 ad de 55 48 8b 17 4c 8b 47 08 48 89 e5 48 39 c2 74 27 48 b8 00 02 00 00 00 00 ad de 49 39 c0 74 2d <49> 8b 30 48 39 fe 75 3d 48 8b 52 08 48 39 f2 75 4c b8 01 00 00 00 [518758.189967] RSP: 0018:ffffa27e43903990 EFLAGS: 00010207 [518758.189968] RAX: dead000000000200 RBX: ffff8bce69f0f000 RCX: 0000000000000000 [518758.189969] RDX: 0000000000000000 RSI: ffff8bce69f0f064 RDI: ffff8bce69f0f1e0 [518758.189971] RBP: ffffa27e43903990 R08: 0000000000000000 R09: ffff8bce69e788c0 [518758.189972] R10: ffff8bce62acd400 R11: 00000000000003cb R12: ffff8bce69e78000 [518758.189973] R13: ffff8bce69f0f140 R14: 0000000000000000 R15: 0000000000000000 [518758.189975] FS: 00007fa1572c8f80(0000) GS:ffff8bce6e0c0000(0000) knlGS:0000000000000000 [518758.189976] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [518758.189977] CR2: 0000000000000000 CR3: 000000040a398006 CR4: 00000000003606e0 [518758.189979] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [518758.189980] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation") Signed-off-by: Vinicius Costa Gomes Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 93b58fde99b7..1bef152c5721 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -392,7 +392,6 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -404,6 +403,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, if (!q->qdisc) return -ENOMEM; + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_hash_add(q->qdisc, false); q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); @@ -413,17 +416,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - err = cbs_change(sch, opt, extack); - if (err) - return err; - - if (!q->offload) { - spin_lock(&cbs_list_lock); - list_add(&q->cbs_list, &cbs_list); - spin_unlock(&cbs_list_lock); - } - - return 0; + return cbs_change(sch, opt, extack); } static void cbs_destroy(struct Qdisc *sch) @@ -431,15 +424,18 @@ static void cbs_destroy(struct Qdisc *sch) struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - spin_lock(&cbs_list_lock); - list_del(&q->cbs_list); - spin_unlock(&cbs_list_lock); + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); - if (q->qdisc) - qdisc_put(q->qdisc); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + + qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) -- cgit From adecda5bee0a05c11f1a4a4b16b01d11a832fd43 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2019 11:09:37 +0200 Subject: net: print proper warning on dst underflow Proper warnings with stack traces make it much easier to figure out what's doing the double free and create more meaningful bug reports from users. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/core/dst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 1325316d9eab..193af526e908 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -172,7 +172,7 @@ void dst_release(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) @@ -187,7 +187,7 @@ void dst_release_immediate(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) -- cgit From ea8564c865299815095bebeb4b25bef474218e4c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 24 Sep 2019 19:11:52 +0800 Subject: openvswitch: change type of UPCALL_PID attribute to NLA_UNSPEC userspace openvswitch patch "(dpif-linux: Implement the API functions to allow multiple handler threads read upcall)" changes its type from U32 to UNSPEC, but leave the kernel unchanged and after kernel 6e237d099fac "(netlink: Relax attr validation for fixed length types)", this bug is exposed by the below warning [ 57.215841] netlink: 'ovs-vswitchd': attribute type 5 has an invalid length. Fixes: 5cd667b0a456 ("openvswitch: Allow each vport to have an array of 'port_id's") Signed-off-by: Li RongQing Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index dde9d762edee..f30e406fbec5 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2294,7 +2294,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, -- cgit From ca7a03c4175366a92cee0ccc4fec0038c3266e26 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2019 16:01:28 +0200 Subject: ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule Commit 7d9e5f422150 removed references from certain dsts, but accounting for this never translated down into the fib6 suppression code. This bug was triggered by WireGuard users who use wg-quick(8), which uses the "suppress-prefix" directive to ip-rule(8) for routing all of their internet traffic without routing loops. The test case added here causes the reference underflow by causing packets to evaluate a suppress rule. Fixes: 7d9e5f422150 ("ipv6: convert major tx path to use RT6_LOOKUP_F_DST_NOREF") Signed-off-by: Jason A. Donenfeld Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index d22b6c140f23..f9e8fe3ff0c5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -287,7 +287,8 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg return false; suppress_route: - ip6_rt_put(rt); + if (!(arg->flags & FIB_LOOKUP_NOREF)) + ip6_rt_put(rt); return true; } -- cgit From 0355d6c1d591b8f9e281783ec0cf95fbed893194 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 12:29:34 -0700 Subject: kcm: disable preemption in kcm_parse_func_strparser() After commit a2c11b034142 ("kcm: use BPF_PROG_RUN") syzbot easily triggers the warning in cant_sleep(). As explained in commit 6cab5e90ab2b ("bpf: run bpf programs with preemption disabled") we need to disable preemption before running bpf programs. BUG: assuming atomic context at net/kcm/kcmsock.c:382 in_atomic(): 0, irqs_disabled(): 0, pid: 7, name: kworker/u4:0 3 locks held by kworker/u4:0/7: #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: __write_once_size include/linux/compiler.h:226 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: atomic64_set include/asm-generic/atomic-instrumented.h:855 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: atomic_long_set include/asm-generic/atomic-long.h:40 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: set_work_data kernel/workqueue.c:620 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: process_one_work+0x88b/0x1740 kernel/workqueue.c:2240 #1: ffff8880a989fdc0 ((work_completion)(&strp->work)){+.+.}, at: process_one_work+0x8c1/0x1740 kernel/workqueue.c:2244 #2: ffff888098998d10 (sk_lock-AF_INET){+.+.}, at: lock_sock include/net/sock.h:1522 [inline] #2: ffff888098998d10 (sk_lock-AF_INET){+.+.}, at: strp_sock_lock+0x2e/0x40 net/strparser/strparser.c:440 CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kstrp strp_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 __cant_sleep kernel/sched/core.c:6826 [inline] __cant_sleep.cold+0xa4/0xbc kernel/sched/core.c:6803 kcm_parse_func_strparser+0x54/0x200 net/kcm/kcmsock.c:382 __strp_recv+0x5dc/0x1b20 net/strparser/strparser.c:221 strp_recv+0xcf/0x10b net/strparser/strparser.c:343 tcp_read_sock+0x285/0xa00 net/ipv4/tcp.c:1639 strp_read_sock+0x14d/0x200 net/strparser/strparser.c:366 do_strp_work net/strparser/strparser.c:414 [inline] strp_work+0xe3/0x130 net/strparser/strparser.c:423 process_one_work+0x9af/0x1740 kernel/workqueue.c:2269 Fixes: a2c11b034142 ("kcm: use BPF_PROG_RUN") Fixes: 6cab5e90ab2b ("bpf: run bpf programs with preemption disabled") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 8f12f5c6ab87..ea9e73428ed9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -378,8 +378,12 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; + int res; - return BPF_PROG_RUN(prog, skb); + preempt_disable(); + res = BPF_PROG_RUN(prog, skb); + preempt_enable(); + return res; } static int kcm_read_sock_done(struct strparser *strp, int err) -- cgit From 159d2c7d8106177bd9a986fd005a311fe0d11285 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 13:11:26 -0700 Subject: sch_netem: fix rcu splat in netem_enqueue() qdisc_root() use from netem_enqueue() triggers a lockdep warning. __dev_queue_xmit() uses rcu_read_lock_bh() which is not equivalent to rcu_read_lock() + local_bh_disable_bh as far as lockdep is concerned. WARNING: suspicious RCU usage 5.3.0-rc7+ #0 Not tainted ----------------------------- include/net/sch_generic.h:492 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by syz-executor427/8855: #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: lwtunnel_xmit_redirect include/net/lwtunnel.h:92 [inline] #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: ip_finish_output2+0x2dc/0x2570 net/ipv4/ip_output.c:214 #1: 00000000b5525c01 (rcu_read_lock_bh){....}, at: __dev_queue_xmit+0x20a/0x3650 net/core/dev.c:3804 #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: spin_lock include/linux/spinlock.h:338 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_xmit_skb net/core/dev.c:3502 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_queue_xmit+0x14b8/0x3650 net/core/dev.c:3838 stack backtrace: CPU: 0 PID: 8855 Comm: syz-executor427 Not tainted 5.3.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5357 qdisc_root include/net/sch_generic.h:492 [inline] netem_enqueue+0x1cfb/0x2d80 net/sched/sch_netem.c:479 __dev_xmit_skb net/core/dev.c:3527 [inline] __dev_queue_xmit+0x15d2/0x3650 net/core/dev.c:3838 dev_queue_xmit+0x18/0x20 net/core/dev.c:3902 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip_finish_output2+0x1726/0x2570 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x5fc/0xb90 net/ipv4/ip_output.c:290 ip_finish_output+0x38/0x1f0 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip_mc_output+0x292/0xf40 net/ipv4/ip_output.c:417 dst_output include/net/dst.h:436 [inline] ip_local_out+0xbb/0x190 net/ipv4/ip_output.c:125 ip_send_skb+0x42/0xf0 net/ipv4/ip_output.c:1555 udp_send_skb.isra.0+0x6b2/0x1160 net/ipv4/udp.c:887 udp_sendmsg+0x1e96/0x2820 net/ipv4/udp.c:1174 inet_sendmsg+0x9e/0xe0 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f5cb35e550f8..0e44039e729c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -476,7 +476,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * skb will be queued. */ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = qdisc_root(sch); + struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; -- cgit From 4f6570d7206bb052f42718d55fbe72977f0318ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:14 -0700 Subject: ipv6: add priority parameter to ip6_xmit() Currently, ip6_xmit() sets skb->priority based on sk->sk_priority This is not desirable for TCP since TCP shares the same ctl socket for a given netns. We want to be able to send RST or ACK packets with a non zero skb->priority. This patch has no functional change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 5 +++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 6 ++++-- net/sctp/ipv6.c | 2 +- 5 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1b7381ff787b..25aab672fc99 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -230,7 +230,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -284,7 +285,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 4da24aa6c696..0a0945a5b30d 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused fl6.daddr = sk->sk_v6_daddr; res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + np->tclass, sk->sk_priority); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 89a4c7c2e25d..edadee4a7e76 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -193,7 +193,7 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) * which are using proper atomic operations or spinlocks. */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass) + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); @@ -258,7 +258,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); - skb->priority = sk->sk_priority; + skb->priority = priority; skb->mark = mark; mtu = dst_mtu(dst); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 87f44d3250ee..806064c28867 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -512,7 +512,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -907,7 +908,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, + 0); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e5f2fc726a98..dd860fea0148 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -215,7 +215,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - tclass); + tclass, sk->sk_priority); rcu_read_unlock(); return res; } -- cgit From e9a5dceee56cb527a3498f1a59bd8726baa1e717 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:15 -0700 Subject: ipv6: tcp: provide sk->sk_priority to ctl packets We can populate skb->priority for some ctl packets instead of always using zero. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 806064c28867..5f557bf27da2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -804,7 +804,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, __be32 label) + u8 tclass, __be32 label, u32 priority) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -909,7 +909,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, - 0); + priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -932,6 +932,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) struct sock *sk1 = NULL; #endif __be32 label = 0; + u32 priority = 0; struct net *net; int oif = 0; @@ -992,6 +993,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) trace_tcp_send_reset(sk, skb); if (np->repflow) label = ip6_flowlabel(ipv6h); + priority = sk->sk_priority; } if (sk->sk_state == TCP_TIME_WAIT) label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); @@ -1001,7 +1003,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, - label); + label, priority); #ifdef CONFIG_TCP_MD5SIG out: @@ -1012,10 +1014,10 @@ out: static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, - __be32 label) + __be32 label, u32 priority) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, - tclass, label); + tclass, label, priority); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1027,7 +1029,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0); inet_twsk_put(tw); } @@ -1050,7 +1052,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), - 0, 0); + 0, 0, sk->sk_priority); } -- cgit From f6c0f5d209fa80eb808e08aa4206f6e264041ef6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:16 -0700 Subject: tcp: honor SO_PRIORITY in TIME_WAIT state ctl packets sent on behalf of TIME_WAIT sockets currently have a zero skb->priority, which can cause various problems. In this patch we : - add a tw_priority field in struct inet_timewait_sock. - populate it from sk->sk_priority when a TIME_WAIT is created. - For IPv4, change ip_send_unicast_reply() and its two callers to propagate tw_priority correctly. ip_send_unicast_reply() no longer changes sk->sk_priority. - For IPv6, make sure TIME_WAIT sockets pass their tw_priority field to tcp_v6_send_response() and tcp_v6_send_ack(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 1 - net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv4/tcp_minisocks.c | 1 + net/ipv6/tcp_ipv6.c | 6 ++++-- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a77c3a4c24de..28fca408812c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1694,7 +1694,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, inet_sk(sk)->tos = arg->tos; - sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd394ad179a0..2ee45e3755e9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -771,6 +771,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); } ip_send_unicast_reply(ctl_sk, @@ -866,6 +868,8 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8bcaf2586b68..bb140a5db8c0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_transparent = inet->transparent; tw->tw_mark = sk->sk_mark; + tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f557bf27da2..e3d9f4559c99 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -995,8 +995,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); priority = sk->sk_priority; } - if (sk->sk_state == TCP_TIME_WAIT) + if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + priority = inet_twsk(sk)->tw_priority; + } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); @@ -1029,7 +1031,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority); inet_twsk_put(tw); } -- cgit From 05733434ee9ae6548723a808647248583e347cca Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Tue, 24 Sep 2019 08:51:16 -0700 Subject: net/rds: Check laddr_check before calling it In rds_bind(), laddr_check is called without checking if it is NULL or not. And rs_transport should be reset if rds_add_bound() fails. Fixes: c5c1a030a7db ("net/rds: An rds_sock is added too early to the hash table") Reported-by: syzbot+fae39afd2101a17ec624@syzkaller.appspotmail.com Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/bind.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index 20c156a73e73..5b5fb4ca8d3e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -244,7 +244,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ if (rs->rs_transport) { trans = rs->rs_transport; - if (trans->laddr_check(sock_net(sock->sk), + if (!trans->laddr_check || + trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; @@ -263,6 +264,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); + if (ret) + rs->rs_transport = NULL; out: release_sock(sk); -- cgit From 4ce70b4aed5752332b268909336b351721965dc4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:16 +0300 Subject: net: sched: sch_htb: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for htb: tc qdisc add dev ens1f0 root handle 1: htb default 12 tc class add dev ens1f0 parent 1: classid 1:1 htb rate 100kbps ceil 100kbps tc qdisc add dev ens1f0 parent 1:1 handle 40: sfq perturb 10 tc class add dev ens1f0 parent 1:1 classid 1:2 htb rate 100kbps ceil 100kbps Resulting dmesg: [ 4791.148551] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 4791.151354] in_atomic(): 1, irqs_disabled(): 0, pid: 27273, name: tc [ 4791.152805] INFO: lockdep is turned off. [ 4791.153605] CPU: 19 PID: 27273 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 4791.154336] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 4791.155075] Call Trace: [ 4791.155803] dump_stack+0x85/0xc0 [ 4791.156529] ___might_sleep.cold+0xac/0xbc [ 4791.157251] __mutex_lock+0x5b/0x960 [ 4791.157966] ? console_unlock+0x363/0x5d0 [ 4791.158676] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.159395] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.160103] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.160815] tcf_block_put_ext.part.0+0x21/0x50 [ 4791.161530] tcf_block_put+0x50/0x70 [ 4791.162233] sfq_destroy+0x15/0x50 [sch_sfq] [ 4791.162936] qdisc_destroy+0x5f/0x160 [ 4791.163642] htb_change_class.cold+0x5df/0x69d [sch_htb] [ 4791.164505] tc_ctl_tclass+0x19d/0x480 [ 4791.165360] rtnetlink_rcv_msg+0x170/0x4b0 [ 4791.166191] ? netlink_deliver_tap+0x95/0x400 [ 4791.166907] ? rtnl_dellink+0x2d0/0x2d0 [ 4791.167625] netlink_rcv_skb+0x49/0x110 [ 4791.168345] netlink_unicast+0x171/0x200 [ 4791.169058] netlink_sendmsg+0x224/0x3f0 [ 4791.169771] sock_sendmsg+0x5e/0x60 [ 4791.170475] ___sys_sendmsg+0x2ae/0x330 [ 4791.171183] ? ___sys_recvmsg+0x159/0x1f0 [ 4791.171894] ? do_wp_page+0x9c/0x790 [ 4791.172595] ? __handle_mm_fault+0xcd3/0x19e0 [ 4791.173309] __sys_sendmsg+0x59/0xa0 [ 4791.174024] do_syscall_64+0x5c/0xb0 [ 4791.174725] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 4791.175435] RIP: 0033:0x7f0aa41497b8 [ 4791.176129] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 4791.177532] RSP: 002b:00007fff4e37d588 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 4791.178243] RAX: ffffffffffffffda RBX: 000000005d8132f7 RCX: 00007f0aa41497b8 [ 4791.178947] RDX: 0000000000000000 RSI: 00007fff4e37d5f0 RDI: 0000000000000003 [ 4791.179662] RBP: 0000000000000000 R08: 0000000000000001 R09: 00000000020149a0 [ 4791.180382] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 4791.181100] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 In htb_change_class() function save parent->leaf.q to local temporary variable and put reference to it after sch tree lock is released in order not to call potentially sleeping cls API in atomic section. This is safe to do because Qdisc has already been reset by qdisc_purge_queue() inside sch tree lock critical section. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7bcf20ef9145..8184c87da8be 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1302,6 +1302,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HTB_MAX + 1]; + struct Qdisc *parent_qdisc = NULL; struct tc_htb_opt *hopt; u64 rate64, ceil64; int warn = 0; @@ -1401,7 +1402,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (parent && !parent->level) { /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); - qdisc_put(parent->leaf.q); + parent_qdisc = parent->leaf.q; if (parent->prio_activity) htb_deactivate(q, parent); @@ -1480,6 +1481,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); sch_tree_unlock(sch); + qdisc_put(parent_qdisc); if (warn) pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", -- cgit From c2999f7fb05b87da4060e38150c70fa46794d82b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:17 +0300 Subject: net: sched: multiq: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for multiq: tc qdisc add dev ens1f0 root handle 1: multiq tc qdisc add dev ens1f0 parent 1:10 handle 50: sfq perturb 10 ethtool -L ens1f0 combined 2 tc qdisc change dev ens1f0 root handle 1: multiq Resulting dmesg: [ 5539.419344] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 5539.420945] in_atomic(): 1, irqs_disabled(): 0, pid: 27658, name: tc [ 5539.422435] INFO: lockdep is turned off. [ 5539.423904] CPU: 21 PID: 27658 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 5539.425400] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 5539.426911] Call Trace: [ 5539.428380] dump_stack+0x85/0xc0 [ 5539.429823] ___might_sleep.cold+0xac/0xbc [ 5539.431262] __mutex_lock+0x5b/0x960 [ 5539.432682] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.434103] ? __nla_validate_parse+0x51/0x840 [ 5539.435493] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.436903] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.438327] tcf_block_put_ext.part.0+0x21/0x50 [ 5539.439752] tcf_block_put+0x50/0x70 [ 5539.441165] sfq_destroy+0x15/0x50 [sch_sfq] [ 5539.442570] qdisc_destroy+0x5f/0x160 [ 5539.444000] multiq_tune+0x14a/0x420 [sch_multiq] [ 5539.445421] tc_modify_qdisc+0x324/0x840 [ 5539.446841] rtnetlink_rcv_msg+0x170/0x4b0 [ 5539.448269] ? netlink_deliver_tap+0x95/0x400 [ 5539.449691] ? rtnl_dellink+0x2d0/0x2d0 [ 5539.451116] netlink_rcv_skb+0x49/0x110 [ 5539.452522] netlink_unicast+0x171/0x200 [ 5539.453914] netlink_sendmsg+0x224/0x3f0 [ 5539.455304] sock_sendmsg+0x5e/0x60 [ 5539.456686] ___sys_sendmsg+0x2ae/0x330 [ 5539.458071] ? ___sys_recvmsg+0x159/0x1f0 [ 5539.459461] ? do_wp_page+0x9c/0x790 [ 5539.460846] ? __handle_mm_fault+0xcd3/0x19e0 [ 5539.462263] __sys_sendmsg+0x59/0xa0 [ 5539.463661] do_syscall_64+0x5c/0xb0 [ 5539.465044] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 5539.466454] RIP: 0033:0x7f1fe08177b8 [ 5539.467863] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 5539.470906] RSP: 002b:00007ffe812de5d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 5539.472483] RAX: ffffffffffffffda RBX: 000000005d8135e3 RCX: 00007f1fe08177b8 [ 5539.474069] RDX: 0000000000000000 RSI: 00007ffe812de640 RDI: 0000000000000003 [ 5539.475655] RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000182e9b0 [ 5539.477203] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 5539.478699] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 Rearrange locking in multiq_tune() in following ways: - In loop that removes Qdiscs from disabled queues, call qdisc_purge_queue() instead of qdisc_tree_flush_backlog() on Qdisc that is being destroyed. Save the Qdisc in temporary allocated array and call qdisc_put() on each element of the array after sch tree lock is released. This is safe to do because Qdiscs have already been reset by qdisc_purge_queue() inside sch tree lock critical section. - Do the same change for second loop that initializes Qdiscs for newly enabled queues in multiq_tune() function. Since sch tree lock is obtained and released on each iteration of this loop, just call qdisc_put() directly outside of critical section. Don't verify that old Qdisc is not noop_qdisc before releasing reference to it because such check is already performed by qdisc_put*() functions. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/sch_multiq.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e1087746f6a2..b2b7fdb06fc6 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -174,7 +174,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; - int i; + struct Qdisc **removed; + int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; @@ -185,6 +186,11 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + GFP_KERNEL); + if (!removed) + return -ENOMEM; + sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { @@ -192,13 +198,17 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_flush_backlog(child); - qdisc_put(child); + qdisc_purge_queue(child); + removed[n_removed++] = child; } } sch_tree_unlock(sch); + for (i = 0; i < n_removed; i++) + qdisc_put(removed[i]); + kfree(removed); + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; @@ -213,11 +223,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, if (child != &noop_qdisc) qdisc_hash_add(child, true); - if (old != &noop_qdisc) { - qdisc_tree_flush_backlog(old); - qdisc_put(old); - } + if (old != &noop_qdisc) + qdisc_purge_queue(old); sch_tree_unlock(sch); + qdisc_put(old); } } } -- cgit From e3ae1f96accd21405715fe9c56b4d83bc7d96d44 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:18 +0300 Subject: net: sched: sch_sfb: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for sfb: tc qdisc add dev ens1f0 handle 1: root sfb tc qdisc add dev ens1f0 parent 1:10 handle 50: sfq perturb 10 tc qdisc change dev ens1f0 root handle 1: sfb Resulting dmesg: [ 7265.938717] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 7265.940152] in_atomic(): 1, irqs_disabled(): 0, pid: 28579, name: tc [ 7265.941455] INFO: lockdep is turned off. [ 7265.942744] CPU: 11 PID: 28579 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 7265.944065] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 7265.945396] Call Trace: [ 7265.946709] dump_stack+0x85/0xc0 [ 7265.947994] ___might_sleep.cold+0xac/0xbc [ 7265.949282] __mutex_lock+0x5b/0x960 [ 7265.950543] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.951803] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.953022] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.954248] tcf_block_put_ext.part.0+0x21/0x50 [ 7265.955478] tcf_block_put+0x50/0x70 [ 7265.956694] sfq_destroy+0x15/0x50 [sch_sfq] [ 7265.957898] qdisc_destroy+0x5f/0x160 [ 7265.959099] sfb_change+0x175/0x330 [sch_sfb] [ 7265.960304] tc_modify_qdisc+0x324/0x840 [ 7265.961503] rtnetlink_rcv_msg+0x170/0x4b0 [ 7265.962692] ? netlink_deliver_tap+0x95/0x400 [ 7265.963876] ? rtnl_dellink+0x2d0/0x2d0 [ 7265.965064] netlink_rcv_skb+0x49/0x110 [ 7265.966251] netlink_unicast+0x171/0x200 [ 7265.967427] netlink_sendmsg+0x224/0x3f0 [ 7265.968595] sock_sendmsg+0x5e/0x60 [ 7265.969753] ___sys_sendmsg+0x2ae/0x330 [ 7265.970916] ? ___sys_recvmsg+0x159/0x1f0 [ 7265.972074] ? do_wp_page+0x9c/0x790 [ 7265.973233] ? __handle_mm_fault+0xcd3/0x19e0 [ 7265.974407] __sys_sendmsg+0x59/0xa0 [ 7265.975591] do_syscall_64+0x5c/0xb0 [ 7265.976753] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 7265.977938] RIP: 0033:0x7f229069f7b8 [ 7265.979117] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 7265.981681] RSP: 002b:00007ffd7ed2d158 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 7265.983001] RAX: ffffffffffffffda RBX: 000000005d813ca1 RCX: 00007f229069f7b8 [ 7265.984336] RDX: 0000000000000000 RSI: 00007ffd7ed2d1c0 RDI: 0000000000000003 [ 7265.985682] RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000165c9a0 [ 7265.987021] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 7265.988309] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 In sfb_change() function use qdisc_purge_queue() instead of qdisc_tree_flush_backlog() to properly reset old child Qdisc and save pointer to it into local temporary variable. Put reference to Qdisc after sch tree lock is released in order not to call potentially sleeping cls API in atomic section. This is safe to do because Qdisc has already been reset by qdisc_purge_queue() inside sch tree lock critical section. Reported-by: syzbot+ac54455281db908c581e@syzkaller.appspotmail.com Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Suggested-by: Cong Wang Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1dff8506a715..d448fe3068e5 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -488,7 +488,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); - struct Qdisc *child; + struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; @@ -518,8 +518,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_hash_add(child, true); sch_tree_lock(sch); - qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + qdisc_purge_queue(q->qdisc); + old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); @@ -542,6 +542,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, sfb_init_perturbation(1, q); sch_tree_unlock(sch); + qdisc_put(old); return 0; } -- cgit From dfe5999dc03e321d08190772c98843284d5cf419 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 25 Sep 2019 18:02:35 +0300 Subject: net/sched: Set default of CONFIG_NET_TC_SKB_EXT to N This a new feature, it is preferred that it defaults to N. We will probe the feature support from userspace before actually using it. Fixes: 95a7233c452a ('net: openvswitch: Set OvS recirc_id from tc chain index') Signed-off-by: Paul Blakey Signed-off-by: David S. Miller --- net/sched/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5b044ae6dc1e..2985509147a2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -966,7 +966,6 @@ config NET_IFE_SKBTCINDEX config NET_TC_SKB_EXT bool "TC recirculation support" depends on NET_CLS_ACT - default y if NET_CLS_ACT select SKB_EXTENSIONS help -- cgit From 6b3656a60f2067738d1a423328199720806f0c44 Mon Sep 17 00:00:00 2001 From: "Kevin(Yudong) Yang" Date: Thu, 26 Sep 2019 10:30:05 -0400 Subject: tcp_bbr: fix quantization code to not raise cwnd if not probing bandwidth There was a bug in the previous logic that attempted to ensure gain cycling gets inflight above BDP even for small BDPs. This code correctly raised and lowered target inflight values during the gain cycle. And this code correctly ensured that cwnd was raised when probing bandwidth. However, it did not correspondingly ensure that cwnd was *not* raised in this way when *not* probing for bandwidth. The result was that small-BDP flows that were always cwnd-bound could go for many cycles with a fixed cwnd, and not probe or yield bandwidth at all. This meant that multiple small-BDP flows could fail to converge in their bandwidth allocations. Fixes: 3c346b233c68 ("tcp_bbr: fix bw probing to raise in-flight data for very small BDPs") Signed-off-by: Kevin(Yudong) Yang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 95b59540eee1..32772d6ded4e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -388,7 +388,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * which allows 2 outstanding 2-packet sequences, to try to keep pipe * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) { struct bbr *bbr = inet_csk_ca(sk); @@ -399,7 +399,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) cwnd = (cwnd + 1) & ~1U; /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) cwnd += 2; return cwnd; @@ -411,7 +411,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) u32 inflight; inflight = bbr_bdp(sk, bw, gain); - inflight = bbr_quantization_budget(sk, inflight, gain); + inflight = bbr_quantization_budget(sk, inflight); return inflight; } @@ -531,7 +531,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, * due to aggregation (of data and/or ACKs) visible in the ACK stream. */ target_cwnd += bbr_ack_aggregation_cwnd(sk); - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); /* If we're below target cwnd, slow start cwnd toward target cwnd. */ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ -- cgit From 174e23810cd3183dc2ca3f5166ef965a55eaaf54 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Sep 2019 20:37:05 +0200 Subject: sk_buff: drop all skb extensions on free and skb scrubbing Now that we have a 3rd extension, add a new helper that drops the extension space and use it when we need to scrub an sk_buff. At this time, scrubbing clears secpath and bridge netfilter data, but retains the tc skb extension, after this patch all three get cleared. NAPI reuse/free assumes we can only have a secpath attached to skb, but it seems better to clear all extensions there as well. v2: add unlikely hint (Eric Dumazet) Fixes: 95a7233c452a ("net: openvswitch: Set OvS recirc_id from tc chain index") Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/skbuff.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 71b18e80389f..bf3ed413abaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5666,7 +5666,7 @@ EXPORT_SYMBOL(gro_find_complete_by_type); static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_put(skb); kmem_cache_free(skbuff_head_cache, skb); } @@ -5733,7 +5733,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - secpath_reset(skb); + skb_ext_reset(skb); napi->skb = skb; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f12e8a050edb..01d65206f4fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5119,7 +5119,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_reset(skb); nf_reset(skb); nf_reset_trace(skb); -- cgit From a41e8a88b06ee39fad4cb4a8ccf822563560a89c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2019 15:42:51 -0700 Subject: tcp: better handle TCP_USER_TIMEOUT in SYN_SENT state Yuchung Cheng and Marek Majkowski independently reported a weird behavior of TCP_USER_TIMEOUT option when used at connect() time. When the TCP_USER_TIMEOUT is reached, tcp_write_timeout() believes the flow should live, and the following condition in tcp_clamp_rto_to_user_timeout() programs one jiffie timers : remaining = icsk->icsk_user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ This silly situation ends when the max syn rtx count is reached. This patch makes sure we honor both TCP_SYNCNT and TCP_USER_TIMEOUT, avoiding these spurious SYN packets. Fixes: b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout() helper to improve accuracy") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Reported-by: Marek Majkowski Cc: Jon Maxwell Link: https://marc.info/?l=linux-netdev&m=156940118307949&w=2 Acked-by: Jon Maxwell Tested-by: Marek Majkowski Signed-off-by: Marek Majkowski Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index dbd9d2d0ee63..40de2d2364a1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -210,7 +210,7 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - bool expired, do_reset; + bool expired = false, do_reset; int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { @@ -242,9 +242,10 @@ static int tcp_write_timeout(struct sock *sk) if (tcp_out_of_resources(sk, do_reset)) return 1; } + } + if (!expired) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); - } tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) -- cgit