From 543e3a8da5a4c453e992d5351ef405d5e32f27d7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 24 Mar 2016 21:56:21 -0500 Subject: netpoll: Fix extra refcount release in netpoll_cleanup() netpoll_setup() does a dev_hold() on np->dev, the netpoll device. If it fails, it correctly does a dev_put() but leaves np->dev set. If we call netpoll_cleanup() after the failure, np->dev is still set so we do another dev_put(), which decrements the refcount an extra time. It's questionable to call netpoll_cleanup() after netpoll_setup() fails, but it can be difficult to find the problem, and we can easily avoid it in this case. The extra decrements can lead to hangs like this: unregister_netdevice: waiting for bond0 to become free. Usage count = -3 Set and clear np->dev at the points where we dev_hold() and dev_put() the device. Signed-off-by: Bjorn Helgaas Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 94acfc89ad97..a57bd17805b4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -603,7 +603,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; - np->dev = ndev; strlcpy(np->dev_name, ndev->name, IFNAMSIZ); INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); @@ -670,6 +669,7 @@ int netpoll_setup(struct netpoll *np) goto unlock; } dev_hold(ndev); + np->dev = ndev; if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); @@ -770,6 +770,7 @@ int netpoll_setup(struct netpoll *np) return 0; put: + np->dev = NULL; dev_put(ndev); unlock: rtnl_unlock(); -- cgit From c0e760c9c66ebd8a5a1ede81868677f4df993dfb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Mar 2016 00:02:00 +0200 Subject: bpf: make padding in bpf_tunnel_key explicit Make the 2 byte padding in struct bpf_tunnel_key between tunnel_ttl and tunnel_label members explicit. No issue has been observed, and gcc/llvm does padding for the old struct already, where tunnel_label was not yet present, so the current code works, but since it's part of uapi, make sure we don't introduce holes in structs. Therefore, add tunnel_ext that we can use generically in future (f.e. to flag OAM messages for backends, etc). Also add the offset to the compat tests to be sure should some compilers not padd the tail of the old version of bpf_tunnel_key. Fixes: 4018ab1875e0 ("bpf: support flow label for bpf_skb_{set, get}_tunnel_key") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index b7177d01ecb0..4b81b71171b4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1764,6 +1764,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have @@ -1849,6 +1850,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1861,7 +1863,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } - if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label)) + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); -- cgit From c57c7a95da842807b475b823ed2e5435c42cb3b0 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 31 Mar 2016 18:10:31 +0200 Subject: rtnl: fix msg size calculation in if_nlmsg_size() Size of the attribute IFLA_PHYS_PORT_NAME was missing. Fixes: db24a9044ee1 ("net: add support for phys_port_name") CC: David Ahern Signed-off-by: Nicolas Dichtel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f2066772d0f3..a75f7e94b445 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -909,6 +909,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } -- cgit From 5a5abb1fa3b05dd6aa821525832644c1e7d2905f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Mar 2016 02:13:18 +0200 Subject: tun, bpf: fix suspicious RCU usage in tun_{attach, detach}_filter Sasha Levin reported a suspicious rcu_dereference_protected() warning found while fuzzing with trinity that is similar to this one: [ 52.765684] net/core/filter.c:2262 suspicious rcu_dereference_protected() usage! [ 52.765688] other info that might help us debug this: [ 52.765695] rcu_scheduler_active = 1, debug_locks = 1 [ 52.765701] 1 lock held by a.out/1525: [ 52.765704] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 [ 52.765721] stack backtrace: [ 52.765728] CPU: 1 PID: 1525 Comm: a.out Not tainted 4.5.0+ #264 [...] [ 52.765768] Call Trace: [ 52.765775] [] dump_stack+0x85/0xc8 [ 52.765784] [] lockdep_rcu_suspicious+0xd5/0x110 [ 52.765792] [] sk_detach_filter+0x82/0x90 [ 52.765801] [] tun_detach_filter+0x35/0x90 [tun] [ 52.765810] [] __tun_chr_ioctl+0x354/0x1130 [tun] [ 52.765818] [] ? selinux_file_ioctl+0x130/0x210 [ 52.765827] [] tun_chr_ioctl+0x13/0x20 [tun] [ 52.765834] [] do_vfs_ioctl+0x96/0x690 [ 52.765843] [] ? security_file_ioctl+0x43/0x60 [ 52.765850] [] SyS_ioctl+0x79/0x90 [ 52.765858] [] do_syscall_64+0x62/0x140 [ 52.765866] [] entry_SYSCALL64_slow_path+0x25/0x25 Same can be triggered with PROVE_RCU (+ PROVE_RCU_REPEATEDLY) enabled from tun_attach_filter() when user space calls ioctl(tun_fd, TUN{ATTACH, DETACH}FILTER, ...) for adding/removing a BPF filter on tap devices. Since the fix in f91ff5b9ff52 ("net: sk_{detach|attach}_filter() rcu fixes") sk_attach_filter()/sk_detach_filter() now dereferences the filter with rcu_dereference_protected(), checking whether socket lock is held in control path. Since its introduction in 994051625981 ("tun: socket filter support"), tap filters are managed under RTNL lock from __tun_chr_ioctl(). Thus the sock_owned_by_user(sk) doesn't apply in this specific case and therefore triggers the false positive. Extend the BPF API with __sk_attach_filter()/__sk_detach_filter() pair that is used by tap filters and pass in lockdep_rtnl_is_held() for the rcu_dereference_protected() checks instead. Reported-by: Sasha Levin Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 4b81b71171b4..ca7f832b2980 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1149,7 +1149,8 @@ void bpf_prog_destroy(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_destroy); -static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, + bool locked) { struct sk_filter *fp, *old_fp; @@ -1165,10 +1166,8 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; } - old_fp = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + old_fp = rcu_dereference_protected(sk->sk_filter, locked); rcu_assign_pointer(sk->sk_filter, fp); - if (old_fp) sk_filter_uncharge(sk, old_fp); @@ -1247,7 +1246,8 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ -int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, + bool locked) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; @@ -1255,7 +1255,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk); + err = __sk_attach_prog(prog, sk, locked); if (err < 0) { __bpf_prog_release(prog); return err; @@ -1263,7 +1263,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(sk_attach_filter); +EXPORT_SYMBOL_GPL(__sk_attach_filter); + +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk)); +} int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1309,7 +1314,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk); + err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk)); if (err < 0) { bpf_prog_put(prog); return err; @@ -2250,7 +2255,7 @@ static int __init register_sk_filter_ops(void) } late_initcall(register_sk_filter_ops); -int sk_detach_filter(struct sock *sk) +int __sk_detach_filter(struct sock *sk, bool locked) { int ret = -ENOENT; struct sk_filter *filter; @@ -2258,8 +2263,7 @@ int sk_detach_filter(struct sock *sk) if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; - filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + filter = rcu_dereference_protected(sk->sk_filter, locked); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); @@ -2268,7 +2272,12 @@ int sk_detach_filter(struct sock *sk) return ret; } -EXPORT_SYMBOL_GPL(sk_detach_filter); +EXPORT_SYMBOL_GPL(__sk_detach_filter); + +int sk_detach_filter(struct sock *sk) +{ + return __sk_detach_filter(sk, sock_owned_by_user(sk)); +} int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) -- cgit From 727ceaa49bb86518470c19640ed7f067c5aa9485 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 5 Apr 2016 15:58:22 -0500 Subject: Revert "netpoll: Fix extra refcount release in netpoll_cleanup()" This reverts commit 543e3a8da5a4c453e992d5351ef405d5e32f27d7. Direct callers of __netpoll_setup() depend on it to set np->dev, so we can't simply move that assignment up to netpoll_stup(). Reported-by: Bart Van Assche Signed-off-by: Bjorn Helgaas Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a57bd17805b4..94acfc89ad97 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -603,6 +603,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; + np->dev = ndev; strlcpy(np->dev_name, ndev->name, IFNAMSIZ); INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); @@ -669,7 +670,6 @@ int netpoll_setup(struct netpoll *np) goto unlock; } dev_hold(ndev); - np->dev = ndev; if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); @@ -770,7 +770,6 @@ int netpoll_setup(struct netpoll *np) return 0; put: - np->dev = NULL; dev_put(ndev); unlock: rtnl_unlock(); -- cgit From 0a1a37b6d62e6864a77a82e925217c720f91f963 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 5 Apr 2016 07:41:11 -0700 Subject: net: add the AF_KCM entries to family name tables This is for the recent kcm driver, which introduces AF_KCM(41) in b7ac4eb(kcm: Kernel Connection Multiplexor module). Signed-off-by: Dexuan Cui Cc: Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/sock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index b67b9aedb230..7e73c26b6bb4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -221,7 +221,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , + "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -237,7 +238,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , + "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -253,7 +255,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , + "clock-AF_MAX" }; /* -- cgit From a0ca153f98db8cf25298565a09e11fe9d82846ad Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 5 Apr 2016 09:13:39 -0700 Subject: GRE: Disable segmentation offloads w/ CSUM and we are encapsulated via FOU This patch fixes an issue I found in which we were dropping frames if we had enabled checksums on GRE headers that were encapsulated by either FOU or GUE. Without this patch I was barely able to get 1 Gb/s of throughput. With this patch applied I am now at least getting around 6 Gb/s. The issue is due to the fact that with FOU or GUE applied we do not provide a transport offset pointing to the GRE header, nor do we offload it in software as the GRE header is completely skipped by GSO and treated like a VXLAN or GENEVE type header. As such we need to prevent the stack from generating it and also prevent GRE from generating it via any interface we create. Fixes: c3483384ee511 ("gro: Allow tunnel stacking in the case of FOU/GUE") Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b9bcbe77d913..77a71cd68535 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4439,6 +4439,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ -- cgit From 9241e2df4fbc648a92ea0752918e05c26255649e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 16 Apr 2016 02:27:58 +0200 Subject: vlan: pull on __vlan_insert_tag error path and fix csum correction When __vlan_insert_tag() fails from skb_vlan_push() path due to the skb_cow_head(), we need to undo the __skb_push() in the error path as well that was done earlier to move skb->data pointer to mac header. Moreover, I noticed that when in the non-error path the __skb_pull() is done and the original offset to mac header was non-zero, we fixup from a wrong skb->data offset in the checksum complete processing. So the skb_postpush_rcsum() really needs to be done before __skb_pull() where skb->data still points to the mac header start and thus operates under the same conditions as in __vlan_insert_tag(). Fixes: 93515d53b133 ("net: move vlan pop/push functions into common code") Signed-off-by: Daniel Borkmann Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d04c2d1c8c87..e561f9f07d6d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4502,13 +4502,16 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) __skb_push(skb, offset); err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); - if (err) + if (err) { + __skb_pull(skb, offset); return err; + } + skb->protocol = skb->vlan_proto; skb->mac_len += VLAN_HLEN; - __skb_pull(skb, offset); skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + __skb_pull(skb, offset); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; -- cgit