From c5d4d7d83165ae863954b113c7f403d8b58febed Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 4 Sep 2017 10:28:02 +0200 Subject: xfrm: Fix deletion of offloaded SAs on failure. When we off load a SA, it gets pushed to the NIC before we can add it. In case of a failure, we don't delete this SA from the NIC. Fix this by calling xfrm_dev_state_delete on failure. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Reported-by: Shannon Nelson Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2bfbd9121e3b..b997f1395357 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -657,6 +657,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) { x->km.state = XFRM_STATE_DEAD; + xfrm_dev_state_delete(x); __xfrm_state_put(x); goto out; } -- cgit From 67a63387b1417b5954eedb15f638f1f0bee3da49 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 4 Sep 2017 10:59:55 +0200 Subject: xfrm: Fix negative device refcount on offload failure. Reset the offload device at the xfrm_state if the device was not able to offload the state. Otherwise we drop the device refcount twice. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Reported-by: Shannon Nelson Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index acf00104ef31..30e5746085b8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -91,6 +91,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + xso->dev = NULL; dev_put(dev); return 0; } -- cgit From 23e9fcfef1f3d10675acce023592796851bcaf1a Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 12 Sep 2017 14:53:46 +0300 Subject: vti: fix NULL dereference in xfrm_input() Can be reproduced with LTP tests: # icmp-uni-vti.sh -p ah -a sha256 -m tunnel -S fffffffe -k 1 -s 10 IPv4: RIP: 0010:xfrm_input+0x7f9/0x870 ... Call Trace: vti_input+0xaa/0x110 [ip_vti] ? skb_free_head+0x21/0x40 vti_rcv+0x33/0x40 [ip_vti] xfrm4_ah_rcv+0x33/0x60 ip_local_deliver_finish+0x94/0x1e0 ip_local_deliver+0x6f/0xe0 ? ip_route_input_noref+0x28/0x50 ... # icmp-uni-vti.sh -6 -p ah -a sha256 -m tunnel -S fffffffe -k 1 -s 10 IPv6: RIP: 0010:xfrm_input+0x7f9/0x870 ... Call Trace: xfrm6_rcv_tnl+0x3c/0x40 vti6_rcv+0xd5/0xe0 [ip6_vti] xfrm6_ah_rcv+0x33/0x60 ip6_input_finish+0xee/0x460 ip6_input+0x3f/0xb0 ip6_rcv_finish+0x45/0xa0 ipv6_rcv+0x34b/0x540 xfrm_input() invokes xfrm_rcv_cb() -> vti_rcv_cb(), the last callback might call skb_scrub_packet(), which in turn can reset secpath. Fix it by adding a check that skb->sp is not NULL. Fixes: 7e9e9202bccc ("xfrm: Clear RX SKB secpath xfrm_offload") Signed-off-by: Alexey Kodanev Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 2515cd2bc5db..8ac9d32fb79d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -429,7 +429,8 @@ resume: nf_reset(skb); if (decaps) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -440,7 +441,8 @@ resume: err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; -- cgit From cdd10c9627496ad25c87ce6394e29752253c69d3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:23 +0200 Subject: l2tp: ensure sessions are freed after their PPPOL2TP socket If l2tp_tunnel_delete() or l2tp_tunnel_closeall() deletes a session right after pppol2tp_release() orphaned its socket, then the 'sock' variable of the pppol2tp_session_close() callback is NULL. Yet the session is still used by pppol2tp_release(). Therefore we need to take an extra reference in any case, to prevent l2tp_tunnel_delete() or l2tp_tunnel_closeall() from freeing the session. Since the pppol2tp_session_close() callback is only set if the session is associated to a PPPOL2TP socket and that both l2tp_tunnel_delete() and l2tp_tunnel_closeall() hold the PPPOL2TP socket before calling pppol2tp_session_close(), we're sure that pppol2tp_session_close() and pppol2tp_session_destruct() are paired and called in the right order. So the reference taken by the former will be released by the later. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if -- cgit From b228a94066406b6c456321d69643b0d7ce11cfa6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:24 +0200 Subject: l2tp: fix race between l2tp_session_delete() and l2tp_tunnel_closeall() There are several ways to remove L2TP sessions: * deleting a session explicitly using the netlink interface (with L2TP_CMD_SESSION_DELETE), * deleting the session's parent tunnel (either by closing the tunnel's file descriptor or using the netlink interface), * closing the PPPOL2TP file descriptor of a PPP pseudo-wire. In some cases, when these methods are used concurrently on the same session, the session can be removed twice, leading to use-after-free bugs. This patch adds a 'dead' flag, used by l2tp_session_delete() and l2tp_tunnel_closeall() to prevent them from stepping on each other's toes. The session deletion path used when closing a PPPOL2TP file descriptor doesn't need to be adapted. It already has to ensure that a session remains valid for the lifetime of its PPPOL2TP file descriptor. So it takes an extra reference on the session in the ->session_close() callback (pppol2tp_session_close()), which is eventually dropped in the ->sk_destruct() callback of the PPPOL2TP socket (pppol2tp_session_destruct()). Still, __l2tp_session_unhash() and l2tp_session_queue_purge() can be called twice and even concurrently for a given session, but thanks to proper locking and re-initialisation of list fields, this is not an issue. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 6 ++++++ net/l2tp/l2tp_core.h | 1 + 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..d8c2a89a76e1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1750,6 +1753,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..70a12df40a5f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ -- cgit From b621129f4f08c8d42ac4de2e77a07c5cf0c4b740 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 15 Feb 2017 16:33:56 +0300 Subject: netfilter: ipvs: full-functionality option for ECN encapsulation in tunnel IPVS tunnel mode works as simple tunnel (see RFC 3168) copying ECN field to outer header. That's result in packet drops on egress tunnels in case the egress tunnel operates as ECN-capable with Full-functionality option (like ip_tunnel and ip6_tunnel kernel modules), according to RFC 3168 section 9.1.1 recommendation. This patch implements ECN full-functionality option into ipvs xmit code. Cc: netdev@vger.kernel.org Cc: lvs-devel@vger.kernel.org Signed-off-by: Vadim Fedorenko Reviewed-by: Konstantin Khlebnikov Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 90d396814798..4527921b1c3a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, { struct sk_buff *new_skb = NULL; struct iphdr *old_iph = NULL; + __u8 old_dsfield; #ifdef CONFIG_IP_VS_IPV6 struct ipv6hdr *old_ipv6h = NULL; #endif @@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, *payload_len = ntohs(old_ipv6h->payload_len) + sizeof(*old_ipv6h); - *dsfield = ipv6_get_dsfield(old_ipv6h); + old_dsfield = ipv6_get_dsfield(old_ipv6h); *ttl = old_ipv6h->hop_limit; if (df) *df = 0; @@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, /* fix old IP header checksum */ ip_send_check(old_iph); - *dsfield = ipv4_get_dsfield(old_iph); + old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) *payload_len = ntohs(old_iph->tot_len); } + /* Implement full-functionality option for ECN encapsulation */ + *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); + return skb; error: kfree_skb(skb); -- cgit From 36f6ee22d2d66046e369757ec6bbe1c482957ba6 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 26 Sep 2017 15:14:29 +0300 Subject: vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit When running LTP IPsec tests, KASan might report: BUG: KASAN: use-after-free in vti_tunnel_xmit+0xeee/0xff0 [ip_vti] Read of size 4 at addr ffff880dc6ad1980 by task swapper/0/0 ... Call Trace: dump_stack+0x63/0x89 print_address_description+0x7c/0x290 kasan_report+0x28d/0x370 ? vti_tunnel_xmit+0xeee/0xff0 [ip_vti] __asan_report_load4_noabort+0x19/0x20 vti_tunnel_xmit+0xeee/0xff0 [ip_vti] ? vti_init_net+0x190/0x190 [ip_vti] ? save_stack_trace+0x1b/0x20 ? save_stack+0x46/0xd0 dev_hard_start_xmit+0x147/0x510 ? icmp_echo.part.24+0x1f0/0x210 __dev_queue_xmit+0x1394/0x1c60 ... Freed by task 0: save_stack_trace+0x1b/0x20 save_stack+0x46/0xd0 kasan_slab_free+0x70/0xc0 kmem_cache_free+0x81/0x1e0 kfree_skbmem+0xb1/0xe0 kfree_skb+0x75/0x170 kfree_skb_list+0x3e/0x60 __dev_queue_xmit+0x1298/0x1c60 dev_queue_xmit+0x10/0x20 neigh_resolve_output+0x3a8/0x740 ip_finish_output2+0x5c0/0xe70 ip_finish_output+0x4ba/0x680 ip_output+0x1c1/0x3a0 xfrm_output_resume+0xc65/0x13d0 xfrm_output+0x1e4/0x380 xfrm4_output_finish+0x5c/0x70 Can be fixed if we get skb->len before dst_output(). Fixes: b9959fd3b0fa ("vti: switch to new ip tunnel code") Fixes: 22e1b23dafa8 ("vti6: Support inter address family tunneling.") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 3 ++- net/ipv6/ip6_vti.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..89453cf62158 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..bcdc2d557de1 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { -- cgit From 62b982eeb4589b2e6d7c01a90590e3a4c2b2ca19 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Sep 2017 16:16:43 +0200 Subject: l2tp: fix race condition in l2tp_tunnel_delete If we try to delete the same tunnel twice, the first delete operation does a lookup (l2tp_tunnel_get), finds the tunnel, calls l2tp_tunnel_delete, which queues it for deletion by l2tp_tunnel_del_work. The second delete operation also finds the tunnel and calls l2tp_tunnel_delete. If the workqueue has already fired and started running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the same tunnel a second time, and try to free the socket again. Add a dead flag to prevent firing the workqueue twice. Then we can remove the check of queue_work's result that was meant to prevent that race but doesn't. Reproducer: ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000 ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000 ip link set l2tp1 up ip l2tp del tunnel tunnel_id 3000 ip l2tp del tunnel tunnel_id 3000 Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue") Reported-by: Jianlin Shi Signed-off-by: Sabrina Dubroca Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 10 ++++------ net/l2tp/l2tp_core.h | 5 ++++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d8c2a89a76e1..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1688,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. */ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 70a12df40a5f..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -161,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -255,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, -- cgit From 89fcbb564f4a64c439d597c2702f990eed49c8a1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 21 Sep 2017 19:01:36 -0600 Subject: netfilter: xt_socket: Restore mark from full sockets only An out of bounds error was detected on an ARM64 target with Android based kernel 4.9. This occurs while trying to restore mark on a skb from an inet request socket. BUG: KASAN: slab-out-of-bounds in socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248 Read of size 4 at addr ffffffc06a8d824c by task syz-fuzzer/1532 CPU: 7 PID: 1532 Comm: syz-fuzzer Tainted: G W O 4.9.41+ #1 Call trace: [] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:76 [] show_stack+0x28/0x38 arch/arm64/kernel/traps.c:226 [] __dump_stack lib/dump_stack.c:15 [inline] [] dump_stack+0xe4/0x134 lib/dump_stack.c:51 [] print_address_description+0x68/0x258 mm/kasan/report.c:248 [] kasan_report_error mm/kasan/report.c:347 [inline] [] kasan_report.part.2+0x228/0x2f0 mm/kasan/report.c:371 [] kasan_report+0x5c/0x70 mm/kasan/report.c:372 [] check_memory_region_inline mm/kasan/kasan.c:308 [inline] [] __asan_load4+0x88/0xa0 mm/kasan/kasan.c:740 [] socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248 [] socket_mt4_v1_v2_v3+0x3c/0x48 net/netfilter/xt_socket.c:272 [] ipt_do_table+0x54c/0xad8 net/ipv4/netfilter/ip_tables.c:311 [] iptable_mangle_hook+0x6c/0x220 net/ipv4/netfilter/iptable_mangle.c:90 ... Allocated by task 1532: save_stack_trace_tsk+0x0/0x2a0 arch/arm64/kernel/stacktrace.c:131 save_stack_trace+0x28/0x38 arch/arm64/kernel/stacktrace.c:215 save_stack mm/kasan/kasan.c:495 [inline] set_track mm/kasan/kasan.c:507 [inline] kasan_kmalloc+0xd8/0x188 mm/kasan/kasan.c:599 kasan_slab_alloc+0x14/0x20 mm/kasan/kasan.c:537 slab_post_alloc_hook mm/slab.h:417 [inline] slab_alloc_node mm/slub.c:2728 [inline] slab_alloc mm/slub.c:2736 [inline] kmem_cache_alloc+0x14c/0x2e8 mm/slub.c:2741 reqsk_alloc include/net/request_sock.h:87 [inline] inet_reqsk_alloc+0x4c/0x238 net/ipv4/tcp_input.c:6236 tcp_conn_request+0x2b0/0xea8 net/ipv4/tcp_input.c:6341 tcp_v4_conn_request+0xe0/0x100 net/ipv4/tcp_ipv4.c:1256 tcp_rcv_state_process+0x384/0x18a8 net/ipv4/tcp_input.c:5926 tcp_v4_do_rcv+0x2f0/0x3e0 net/ipv4/tcp_ipv4.c:1430 tcp_v4_rcv+0x1278/0x1350 net/ipv4/tcp_ipv4.c:1709 ip_local_deliver_finish+0x174/0x3e0 net/ipv4/ip_input.c:216 v1->v2: Change socket_mt6_v1_v2_v3() as well as mentioned by Eric v2->v3: Put the correct fixes tag Fixes: 01555e74bde5 ("netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index e75ef39669c5..575d2153e3b8 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) @@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) -- cgit From 48596a8ddc46f96afb6a2cd72787cb15d6bb01fc Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 23 Sep 2017 23:37:40 +0200 Subject: netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses Wrong comparison prevented the hash types to add a range with more than 2^31 addresses but reported as a success. Fixes Netfilter's bugzilla id #1005, reported by Oleg Serditov and Oliver Ford. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ip.c | 22 ++++++++++++---------- net/netfilter/ipset/ip_set_hash_ipmark.c | 2 +- net/netfilter/ipset/ip_set_hash_ipport.c | 2 +- net/netfilter/ipset/ip_set_hash_ipportip.c | 2 +- net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_net.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 2 +- net/netfilter/ipset/ip_set_hash_netnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netport.c | 2 +- net/netfilter/ipset/ip_set_hash_netportnet.c | 4 ++-- 10 files changed, 24 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 20bfbd315f61..613eb212cb48 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ip &= ip_set_hostmask(h->netmask); + e.ip = htonl(ip); + if (e.ip == 0) + return -IPSET_ERR_HASH_ELEM; - if (adt == IPSET_TEST) { - e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + if (adt == IPSET_TEST) return adtfn(set, &e, &ext, &ext, flags); - } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { @@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - if (retried) + if (retried) { ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip += hosts) { e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + } + for (; ip <= ip_to;) { ret = adtfn(set, &e, &ext, &ext, flags); - if (ret && !ip_set_eexist(ret, flags)) return ret; + ip += hosts; + e.ip = htonl(ip); + if (e.ip == 0) + return 0; + ret = 0; } return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index b64cf14e8352..f3ba8348cf9d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index f438740e6c6a..ddb8039ec1d2 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 6215fb898c50..a7f4d7a85420 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5ab1b99a53c2..a2f19b9906e9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; @@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip == ntohl(h->next.ip) && p == ntohs(h->next.port) ? ntohl(h->next.ip2) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &cidr); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5d9e895452e7..1c67a1761e45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 44cf11939c91..d417074f1c1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index db614e13b193..7f9ae2e9645b 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 54b64b6cd0cd..e6ef382febe4 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index aff846960ac4..8602f2595a1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) @@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip2 = (retried && ip == ntohl(h->next.ip[0]) && p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); -- cgit From e23ed762db7ed1950a6408c3be80bc56909ab3d4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 11:57:54 +0200 Subject: netfilter: ipset: pernet ops must be unregistered last Removing the ipset module leaves a small window where one cpu performs module removal while another runs a command like 'ipset flush'. ipset uses net_generic(), unregistering the pernet ops frees this storage area. Fix it by first removing the user-visible api handlers and the pernet ops last. Fixes: 1785e8f473082 ("netfiler: ipset: Add net namespace for ipset") Reported-by: Li Shuang Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e495b5e484b1..a7f049ff3049 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2072,25 +2072,28 @@ static struct pernet_operations ip_set_net_ops = { static int __init ip_set_init(void) { - int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + int ret = register_pernet_subsys(&ip_set_net_ops); + if (ret) { + pr_err("ip_set: cannot register pernet_subsys.\n"); + return ret; + } + + ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } + ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } - ret = register_pernet_subsys(&ip_set_net_ops); - if (ret) { - pr_err("ip_set: cannot register pernet_subsys.\n"); - nf_unregister_sockopt(&so_set); - nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - return ret; - } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -2098,9 +2101,10 @@ ip_set_init(void) static void __exit ip_set_fini(void) { - unregister_pernet_subsys(&ip_set_net_ops); nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + + unregister_pernet_subsys(&ip_set_net_ops); pr_debug("these are the famous last words\n"); } -- cgit From c2cc187e53011c1c4931055984657da9085c763b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 25 Sep 2017 13:19:26 +0300 Subject: sctp: Fix a big endian bug in sctp_diag_dump() The sctp_for_each_transport() function takes an pointer to int. The cb->args[] array holds longs so it's only using the high 32 bits. It works on little endian system but will break on big endian 64 bit machines. Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Dan Carpenter Acked-by: Neil Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 22ed01a76b19..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -493,7 +494,8 @@ skip: goto done; sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, (int *)&cb->args[2], &commp); + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; -- cgit From dd269db84908d4d3f7c0efed85bf9d8939fb0b9b Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 27 Sep 2017 14:25:37 +0200 Subject: xfrm: don't call xfrm_policy_cache_flush under xfrm_state_lock I might be wrong but it doesn't look like xfrm_state_lock is required for xfrm_policy_cache_flush and calling it under this lock triggers both "sleeping function called from invalid context" and "possible circular locking dependency detected" warnings on flush. Fixes: ec30d78c14a8 xfrm: add xdst pcpu cache Signed-off-by: Artem Savkov Acked-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0dab1cd79ce4..12213477cd3a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -732,12 +732,12 @@ restart: } } } +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) { err = 0; xfrm_policy_cache_flush(); } -out: - spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_flush); -- cgit From 35f493b87ec072c5a2497ffbee243095ef725827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Sep 2017 08:40:02 -0700 Subject: inetpeer: fix RCU lookup() again My prior fix was not complete, as we were dereferencing a pointer three times per node, not twice as I initially thought. Fixes: 4cc5b44b29a9 ("inetpeer: fix RCU lookup()") Fixes: b145425f269a ("inetpeer: remove AVL implementation in favor of RB tree") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..b20c8ac64081 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; -- cgit From e804441cfe0b60f6c430901946a69c01eac09df1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 25 Sep 2017 15:55:53 -0700 Subject: net: dsa: Fix network device registration order We cannot be registering the network device first, then setting its carrier off and finally connecting it to a PHY, doing that leaves a window during which the carrier is at best inconsistent, and at worse the device is not usable without a down/up sequence since the network device is visible to user space with possibly no PHY device attached. Re-order steps so that they make logical sense. This fixes some devices where the port was not usable after e.g: an unbind then bind of the driver. Fixes: 0071f56e46da ("dsa: Register netdev before phy") Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..865e29e62bad 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1301,28 +1301,33 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_duplex = -1; port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(p->phy); + if (of_phy_is_fixed_link(p->dp->dn)) + of_phy_deregister_fixed_link(p->dp->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) -- cgit From 4971613c1639d8e5f102c4e797c3bf8f83a5a69e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:19:37 -0400 Subject: packet: in packet_do_bind, test fanout with bind_lock held Once a socket has po->fanout set, it remains a member of the group until it is destroyed. The prot_hook must be constant and identical across sockets in the group. If fanout_add races with packet_do_bind between the test of po->fanout and taking the lock, the bind call may make type or dev inconsistent with that of the fanout group. Hold po->bind_lock when testing po->fanout to avoid this race. I had to introduce artificial delay (local_bh_enable) to actually observe the race. Fixes: dc99f600698d ("packet: Add fanout support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d288f52c53f7..a10c2836465c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3069,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { -- cgit From da7c9561015e93d10fe6aab73e9288e0d09d65a6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:20:17 -0400 Subject: packet: only test po->has_vnet_hdr once in packet_snd Packet socket option po->has_vnet_hdr can be updated concurrently with other operations if no ring is attached. Do not test the option twice in packet_snd, as the value may change in between calls. A race on setsockopt disable may cause a packet > mtu to be sent without having GSO options set. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a10c2836465c..bec01a3daf5b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2840,6 +2840,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2883,6 +2884,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2941,7 +2943,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; -- cgit From 9d538fa60bad4f7b23193c89e843797a1cf71ef3 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 26 Sep 2017 17:38:50 -0700 Subject: net: Set sk_prot_creator when cloning sockets to the right proto sk->sk_prot and sk->sk_prot_creator can differ when the app uses IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one). Which is why sk_prot_creator is there to make sure that sk_prot_free() does the kmem_cache_free() on the right kmem_cache slab. Now, if such a socket gets transformed back to a listening socket (using connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through sk_clone_lock() when a new connection comes in. But sk_prot_creator will still point to the IPv6 kmem_cache (as everything got copied in sk_clone_lock()). When freeing, we will thus put this memory back into the IPv6 kmem_cache although it was allocated in the IPv4 cache. I have seen memory corruption happening because of this. With slub-debugging and MEMCG_KMEM enabled this gives the warning "cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP" A C-program to trigger this: void main(void) { int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); int new_fd, newest_fd, client_fd; struct sockaddr_in6 bind_addr; struct sockaddr_in bind_addr4, client_addr1, client_addr2; struct sockaddr unsp; int val; memset(&bind_addr, 0, sizeof(bind_addr)); bind_addr.sin6_family = AF_INET6; bind_addr.sin6_port = ntohs(42424); memset(&client_addr1, 0, sizeof(client_addr1)); client_addr1.sin_family = AF_INET; client_addr1.sin_port = ntohs(42424); client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&client_addr2, 0, sizeof(client_addr2)); client_addr2.sin_family = AF_INET; client_addr2.sin_port = ntohs(42421); client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&unsp, 0, sizeof(unsp)); unsp.sa_family = AF_UNSPEC; bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)); listen(fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1)); new_fd = accept(fd, NULL, NULL); close(fd); val = AF_INET; setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val)); connect(new_fd, &unsp, sizeof(unsp)); memset(&bind_addr4, 0, sizeof(bind_addr4)); bind_addr4.sin_family = AF_INET; bind_addr4.sin_port = ntohs(42421); bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4)); listen(new_fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2)); newest_fd = accept(new_fd, NULL, NULL); close(new_fd); close(client_fd); close(new_fd); } As far as I can see, this bug has been there since the beginning of the git-days. Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..7d55c05f449d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); -- cgit From e5173418ac597cebe9f7a39adf10be470000b518 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 27 Sep 2017 10:06:27 +0100 Subject: netfilter: ipset: Fix race between dump and swap Fix a race between ip_set_dump_start() and ip_set_swap(). The race is as follows: * Without holding the ref lock, ip_set_swap() checks ref_netlink of the set and it is 0. * ip_set_dump_start() takes a reference on the set. * ip_set_swap() does the swap (even though it now has a non-zero reference count). * ip_set_dump_start() gets the set from ip_set_list again which is now a different set since it has been swapped. * ip_set_dump_start() calls __ip_set_put_netlink() and hits a BUG_ON due to the reference count being 0. Fix this race by extending the critical region in which the ref lock is held to include checking the ref counts. The race can be reproduced with the following script: while :; do ipset destroy hash_ip1 ipset destroy hash_ip2 ipset create hash_ip1 hash:ip family inet hashsize 1024 \ maxelem 500000 ipset create hash_ip2 hash:ip family inet hashsize 300000 \ maxelem 500000 ipset create hash_ip3 hash:ip family inet hashsize 1024 \ maxelem 500000 ipset save & ipset swap hash_ip3 hash_ip2 ipset destroy hash_ip3 wait done Signed-off-by: Ross Lagerwall Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index a7f049ff3049..cf84f7b37cd9 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; - if (from->ref_netlink || to->ref_netlink) + write_lock_bh(&ip_set_ref_lock); + + if (from->ref_netlink || to->ref_netlink) { + write_unlock_bh(&ip_set_ref_lock); return -EBUSY; + } strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); - write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; -- cgit From 0d18779be13766b33c69cbc26df38383598da373 Mon Sep 17 00:00:00 2001 From: JingPiao Chen Date: Sat, 23 Sep 2017 17:10:44 +0800 Subject: netfilter: nf_tables: fix update chain error # nft add table filter # nft add chain filter c1 # nft rename chain filter c1 c2 Error: Could not process rule: No such file or directory rename chain filter c1 c2 ^^^^^^^^^^^^^^^^^^^^^^^^^^ # nft add chain filter c2 # nft rename chain filter c1 c2 # nft list table filter table ip filter { chain c2 { } chain c2 { } } Fixes: 664b0f8cd8 ("netfilter: nf_tables: add generation mask to chains") Signed-off-by: JingPiao Chen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 929927171426..f98ca8c6aa59 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1487,8 +1487,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); - if (IS_ERR(chain2)) - return PTR_ERR(chain2); + if (!IS_ERR(chain2)) + return -EEXIST; } if (nla[NFTA_CHAIN_COUNTERS]) { -- cgit From e6b72ee88a56bcfe63f72e9c30766484c45bec72 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Tue, 26 Sep 2017 18:35:45 +0200 Subject: netfilter: ebtables: fix race condition in frame_filter_net_init() It is possible for ebt_in_hook to be triggered before ebt_table is assigned resulting in a NULL-pointer dereference. Make sure hooks are registered as the last step. Fixes: aee12a0a3727 ("ebtables: remove nf_hook_register usage") Signed-off-by: Artem Savkov Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtable_broute.c | 4 ++-- net/bridge/netfilter/ebtable_filter.c | 4 ++-- net/bridge/netfilter/ebtable_nat.c | 4 ++-- net/bridge/netfilter/ebtables.c | 17 +++++++++-------- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 2585b100ebbb..276b60262981 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { - net->xt.broute_table = ebt_register_table(net, &broute_table, NULL); - return PTR_ERR_OR_ZERO(net->xt.broute_table); + return ebt_register_table(net, &broute_table, NULL, + &net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 45a00dbdbcad..c41da5fac84f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = { static int __net_init frame_filter_net_init(struct net *net) { - net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter); - return PTR_ERR_OR_ZERO(net->xt.frame_filter); + return ebt_register_table(net, &frame_filter, ebt_ops_filter, + &net->xt.frame_filter); } static void __net_exit frame_filter_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 57cd5bb154e7..08df7406ecb3 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = { static int __net_init frame_nat_net_init(struct net *net) { - net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat); - return PTR_ERR_OR_ZERO(net->xt.frame_nat); + return ebt_register_table(net, &frame_nat, ebt_ops_nat, + &net->xt.frame_nat); } static void __net_exit frame_nat_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83951f978445..3b3dcf719e07 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) kfree(table); } -struct ebt_table * -ebt_register_table(struct net *net, const struct ebt_table *input_table, - const struct nf_hook_ops *ops) +int ebt_register_table(struct net *net, const struct ebt_table *input_table, + const struct nf_hook_ops *ops, struct ebt_table **res) { struct ebt_table_info *newinfo; struct ebt_table *t, *table; @@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, repl->entries == NULL || repl->entries_size == 0 || repl->counters != NULL || input_table->private != NULL) { BUGPRINT("Bad table data for ebt_register_table!!!\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } /* Don't add one table to multiple lists. */ @@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); mutex_unlock(&ebt_mutex); + WRITE_ONCE(*res, table); + if (!ops) - return table; + return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret) { __ebt_unregister_table(net, table); - return ERR_PTR(ret); + *res = NULL; } - return table; + return ret; free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: @@ -1276,7 +1277,7 @@ free_newinfo: free_table: kfree(table); out: - return ERR_PTR(ret); + return ret; } void ebt_unregister_table(struct net *net, struct ebt_table *table, -- cgit From fef0035c0f31322d417d1954bba5ab959bf91183 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Sep 2017 00:41:44 +0200 Subject: netlink: do not proceed if dump's start() errs Drivers that use the start method for netlink dumping rely on dumpit not being called if start fails. For example, ila_xlat.c allocates memory and assigns it to cb->args[0] in its start() function. It might fail to do that and return -ENOMEM instead. However, even when returning an error, dumpit will be called, which, in the example above, quickly dereferences the memory in cb->args[0], which will OOPS the kernel. This is but one example of how this goes wrong. Since start() has always been a function with an int return type, it therefore makes sense to use it properly, rather than ignoring it. This patch thus returns early and does not call dumpit() when start() fails. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..94c11cf0459d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2270,10 +2270,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) -- cgit From d51711c0557d6dbd26c63144aef32c7b3ec264b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:31 +0800 Subject: ip_gre: ipgre_tap device should keep dst Without keeping dst, the tunnel will not update any mtu/pmtu info, since it does not have a dst on the skb. Reproducer: client(ipgre_tap1 - eth1) <-----> (eth1 - ipgre_tap1)server After reducing eth1's mtu on client, then perforamnce became 0. This patch is to netif_keep_dst in gre_tap_init, as ipgre does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..8b837f6f5532 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1223,6 +1223,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } -- cgit From 2d40557cc702ed8e5edd9bd422233f86652d932e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:50 +0800 Subject: ip6_gre: ip6gre_tap device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed a issue that ipgre_tap mtu couldn't be updated in tx path. The same fix is needed for ip6gre_tap as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 20f66f4c9460..1602b491b281 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1311,6 +1311,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], -- cgit From d41bb33ba33b8f8debe54ed36be6925eb496e354 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:24:07 +0800 Subject: ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path Now when updating mtu in tx path, it doesn't consider ARPHRD_ETHER tunnel device, like ip6gre_tap tunnel, for which it should also subtract ether header to get the correct mtu. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f2f21c24915f..a1c24443cd9e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; -- cgit From 7487449c86c65202b3b725c4524cb48dd65e4e6f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:36 +0200 Subject: IPv4: early demux can return an error code Currently no error is emitted, but this infrastructure will used by the next patch to allow source address validation for mcast sockets. Since early demux can do a route lookup and an ipv4 route lookup can return an error code this is consistent with the current ipv4 route infrastructure. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 25 +++++++++++++++---------- net/ipv4/tcp_ipv4.c | 9 +++++---- net/ipv4/udp.c | 11 ++++++----- 3 files changed, 26 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..85164d4d3e53 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..9b30f821fe96 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2221,7 +2221,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -2234,7 +2234,7 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2244,14 +2244,14 @@ void udp_v4_early_demux(struct sk_buff *skb) struct in_device *in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return; + return 0; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2278,6 +2278,7 @@ void udp_v4_early_demux(struct sk_buff *skb) */ skb_dst_set_noref(skb, dst); } + return 0; } int udp_rcv(struct sk_buff *skb) -- cgit From bc044e8db7962e727a75b591b9851ff2ac5cf846 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:37 +0200 Subject: udp: perform source validation for mcast early demux The UDP early demux can leverate the rx dst cache even for multicast unconnected sockets. In such scenario the ipv4 source address is validated only on the first packet in the given flow. After that, when we fetch the dst entry from the socket rx cache, we stop enforcing the rp_filter and we even start accepting any kind of martian addresses. Disabling the dst cache for unconnected multicast socket will cause large performace regression, nearly reducing by half the max ingress tput. Instead we factor out a route helper to completely validate an skb source address for multicast packets and we call it from the UDP early demux for mcast packets landing on unconnected sockets, after successful fetching the related cached dst entry. This still gives a measurable, but limited performance regression: rp_filter = 0 rp_filter = 1 edmux disabled: 1182 Kpps 1127 Kpps edmux before: 2238 Kpps 2238 Kpps edmux after: 2037 Kpps 2019 Kpps The above figures are on top of current net tree. Applying the net-next commit 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") the delta with rp_filter == 0 will decrease even more. Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/route.c | 46 ++++++++++++++++++++++++++-------------------- net/ipv4/udp.c | 13 ++++++++++++- 2 files changed, 38 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..ac6fde5d45f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. */ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9b30f821fe96..5676237d2b0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2224,6 +2224,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2241,7 +2242,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; @@ -2272,11 +2273,21 @@ int udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } return 0; } -- cgit From aad06212d36cf34859428a0a279e5c14ee5c9e26 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Fri, 29 Sep 2017 10:02:54 +0200 Subject: tipc: use only positive error codes in messages In commit e3a77561e7d32 ("tipc: split up function tipc_msg_eval()"), we have updated the function tipc_msg_lookup_dest() to set the error codes to negative values at destination lookup failures. Thus when the function sets the error code to -TIPC_ERR_NO_NAME, its inserted into the 4 bit error field of the message header as 0xf instead of TIPC_ERR_NO_NAME (1). The value 0xf is an unknown error code. In this commit, we set only positive error code. Fixes: e3a77561e7d32 ("tipc: split up function tipc_msg_eval()") Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..121e59a1d0e7 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); -- cgit From d099b8af46f5e1e37182eff988f9373dcc2b0128 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 12:21:14 +0100 Subject: sunrpc: remove redundant initialization of sock sock is being initialized and then being almost immediately updated hence the initialized value is not being used and is redundant. Remove the initialization. Cleans up clang warning: warning: Value stored to 'sock' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9b5de31aa429..c1841f234a71 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2203,7 +2203,7 @@ static void xs_udp_setup_socket(struct work_struct *work) struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; + struct socket *sock; int status = -EIO; sock = xs_create_sock(xprt, transport, -- cgit From 935a9749a36828af0e8be224a5cd4bc758112c34 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:53 +0800 Subject: ip_gre: get key from session_id correctly in erspan_rcv erspan only uses the first 10 bits of session_id as the key to look up the tunnel. But in erspan_rcv, it missed 'session_id & ID_MASK' when getting the key from session_id. If any other flag is also set in session_id in a packet, it would fail to find the tunnel due to incorrect key in erspan_rcv. This patch is to add 'session_id & ID_MASK' there and also remove the unnecessary variable session_id. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8b837f6f5532..b25b1e5112d0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, -- cgit From 5513d08d29511c263c00933c00dd7a82fffda3c9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:54 +0800 Subject: ip_gre: check packet length and mtu correctly in erspan_xmit As a ARPHRD_ETHER device, skb->len in erspan_xmit is the length of the whole ether packet. So before checking if a packet size exceeds the mtu, skb->len should subtract dev->hard_header_len. Otherwise, all packets with max size according to mtu would be trimmed to be truncated packet. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b25b1e5112d0..2a4ef9dc48ff 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -731,7 +731,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { + if (skb->len - dev->hard_header_len > dev->mtu) { pskb_trim(skb, dev->mtu); truncate = true; } -- cgit From c122fda271717f4fc618e0a31e833941fd5f1efd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:55 +0800 Subject: ip_gre: set tunnel hlen properly in erspan_tunnel_init According to __gre_tunnel_init, tunnel->hlen should be set as the headers' length between inner packet and outer iphdr. It would be used especially to calculate a proper mtu when updating mtu in tnl_update_pmtu. Now without setting it, a bigger mtu value than expected would be updated, which hurts performance a lot. This patch is to fix it by setting tunnel->hlen with: tunnel->tun_hlen + tunnel->encap_hlen + sizeof(struct erspanhdr) Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2a4ef9dc48ff..fad0bb1e3e9a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1245,7 +1245,9 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; -- cgit From c84bed440e4e11a973e8c0254d0dfaccfca41fb0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:56 +0800 Subject: ip_gre: erspan device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed the issue ipgre_tap dev mtu couldn't be updated in tx path. The same fix is needed for erspan as well. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fad0bb1e3e9a..467e44d7587d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1254,6 +1254,7 @@ static int erspan_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } -- cgit From 9f775ead5e570e7e19015b9e4e2f3dd6e71a5935 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 28 Sep 2017 15:44:38 +0200 Subject: l2tp: fix l2tp_eth module loading The l2tp_eth module crashes if its netlink callbacks are run when the pernet data aren't initialised. We should normally register_pernet_device() before the genl callbacks. However, the pernet data only maintain a list of l2tpeth interfaces, and this list is never used. So let's just drop pernet handling instead. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 51 ++------------------------------------------------- 1 file changed, 2 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } -- cgit From eefca20eb20c66b06cf5ed09b49b1a7caaa27b7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2017 12:20:51 -0700 Subject: socket, bpf: fix possible use after free Starting from linux-4.4, 3WHS no longer takes the listener lock. Since this time, we might hit a use-after-free in sk_filter_charge(), if the filter we got in the memcpy() of the listener content just happened to be replaced by a thread changing listener BPF filter. To fix this, we need to make sure the filter refcount is not already zero before incrementing it again. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 12 ++++++++---- net/core/sock.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..74b8c91fb5f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -989,10 +989,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) diff --git a/net/core/sock.c b/net/core/sock.c index 7d55c05f449d..23953b741a41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1684,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new -- cgit From e63aaaa6be54c956b9603590ea436b003407bb3e Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 20 Sep 2017 12:31:28 +0530 Subject: netfilter: nf_tables: Release memory obtained by kasprintf Free memory region, if nf_tables_set_alloc_name is not successful. Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars") Signed-off-by: Arvind Yadav Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f98ca8c6aa59..34adedcb239e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2741,8 +2741,10 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; - if (!strcmp(set->name, i->name)) + if (!strcmp(set->name, i->name)) { + kfree(set->name); return -ENFILE; + } } return 0; } -- cgit From ce024f42c2e28b6bce4ecc1e891b42f57f753892 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 3 Oct 2017 13:20:48 +0300 Subject: net: rtnetlink: fix info leak in RTM_GETSTATS call When RTM_GETSTATS was added the fields of its header struct were not all initialized when returning the result thus leaking 4 bytes of information to user-space per rtnl_fill_statsinfo call, so initialize them now. Thanks to Alexander Potapenko for the detailed report and bisection. Reported-by: Alexander Potapenko Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..d4bcdcc68e92 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3854,6 +3854,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; -- cgit From ad670233c9e1d5feb365d870e30083ef1b889177 Mon Sep 17 00:00:00 2001 From: Peng Xu Date: Tue, 3 Oct 2017 23:21:51 +0300 Subject: nl80211: Define policy for packet pattern attributes Define a policy for packet pattern attributes in order to fix a potential read over the end of the buffer during nla_get_u32() of the NL80211_PKTPAT_OFFSET attribute. Note that the data there can always be read due to SKB allocation (with alignment and struct skb_shared_info at the end), but the data might be uninitialized. This could be used to leak some data from uninitialized vmalloc() memory, but most drivers don't allow an offset (so you'd just get -EINVAL if the data is non-zero) or just allow it with a fixed value - 100 or 128 bytes, so anything above that would get -EINVAL. With brcmfmac the limit is 1500 so (at least) one byte could be obtained. Cc: stable@kernel.org Signed-off-by: Peng Xu Signed-off-by: Jouni Malinen [rewrite description based on SKB allocation knowledge] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 690874293cfc..d396cb61a280 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -549,6 +549,14 @@ nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = { [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED }, }; +/* policy for packet pattern attributes */ +static const struct nla_policy +nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { + [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -10532,7 +10540,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) u8 *mask_pat; nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - NULL, info->extack); + nl80211_packet_pattern_policy, + info->extack); err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -10781,7 +10790,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL); + nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, NULL); if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; -- cgit From e769fcec6bc4bdd1b0e2cf817680148f9c40b1c4 Mon Sep 17 00:00:00 2001 From: Vishakha Narvekar Date: Tue, 3 Oct 2017 16:13:29 -0400 Subject: net: 8021q: skip packets if the vlan is down If the vlan is down, free the packet instead of proceeding with other processing, or counting it as received. If vlan interfaces are used as slaves for bonding, with arp monitoring for connectivity, if the rx counter is seen to be incrementing, then the bond device will not observe that the interface is down. CC: David S. Miller Signed-off-by: Vishakha Narvekar Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. -- cgit From 5f9bfe0ef622a7bb9707c22ceb4b6451e1e2cb7b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 4 Oct 2017 17:18:27 +0200 Subject: netfilter: nf_tables: do not dump chain counters if not enabled Chain counters are only enabled on demand since 9f08ea848117, skip them when dumping them via netlink. Fixes: 9f08ea848117 ("netfilter: nf_tables: keep chain counters away from hot path") Reported-by: Johny Mattsson Tested-by: Johny Mattsson Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 34adedcb239e..64e1ee091225 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1048,7 +1048,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; - if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) + if (basechain->stats && nft_dump_stats(skb, basechain->stats)) goto nla_put_failure; } -- cgit From e466af75c074e76107ae1cd5a2823e9c61894ffb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 02:50:07 -0700 Subject: netfilter: x_tables: avoid stack-out-of-bounds read in xt_copy_counters_from_user syzkaller reports an out of bound read in strlcpy(), triggered by xt_copy_counters_from_user() Fix this by using memcpy(), then forcing a zero byte at the last position of the destination, as Florian did for the non COMPAT code. Fixes: d7591f0c41ce ("netfilter: x_tables: introduce and use xt_copy_counters_from_user") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c83a3b5e1c6c..d8571f414208 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); - strlcpy(info->name, compat_tmp.name, sizeof(info->name)); + memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; user += sizeof(compat_tmp); } else @@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(info, user, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - info->name[sizeof(info->name) - 1] = '\0'; user += sizeof(*info); } + info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; -- cgit From a2d3f3e33853ef52e5f66b41c3e8ee5710aa3305 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 5 Oct 2017 19:03:05 +0200 Subject: ipv6: fix net.ipv6.conf.all.accept_dad behaviour for real Commit 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers") was intended to affect accept_dad flag handling in such a way that DAD operation and mode on a given interface would be selected according to the maximum value of conf/{all,interface}/accept_dad. However, addrconf_dad_begin() checks for particular cases in which we need to skip DAD, and this check was modified in the wrong way. Namely, it was modified so that, if the accept_dad flag is 0 for the given interface *or* for all interfaces, DAD would be skipped. We have instead to skip DAD if accept_dad is 0 for the given interface *and* for all interfaces. Fixes: 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers") Acked-by: Stefano Brivio Signed-off-by: Matteo Croce Reported-by: Erik Kline Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96861c702c06..4a96ebbf8eda 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3820,8 +3820,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || - idev->cnf.accept_dad < 1 || + (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bump_id = ifp->flags & IFA_F_TENTATIVE; -- cgit From 3d0241d57c7b25bb75ac9d7a62753642264fdbce Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 6 Oct 2017 19:02:35 +0300 Subject: gso: fix payload length when gso_size is zero When gso_size reset to zero for the tail segment in skb_segment(), later in ipv6_gso_segment(), __skb_udp_tunnel_segment() and gre_gso_segment() we will get incorrect results (payload length, pcsum) for that segment. inet_gso_segment() already has a check for gso_size before calculating payload. The issue was found with LTP vxlan & gre tests over ixgbe NIC. Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer") Signed-off-by: Alexey Kodanev Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/ip6_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 416bb304a281..1859c473b21a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 97658bfc1b58..e360d55be555 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728faca7..4a87f9428ca5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); -- cgit From 3382605fd8db1ed1fb03f3f1529490133fe3ab08 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 7 Oct 2017 14:32:49 +0200 Subject: tipc: correct initialization of skb list We change the initialization of the skb transmit buffer queues in the functions tipc_bcast_xmit() and tipc_rcast_xmit() to also initialize their spinlocks. This is needed because we may, during error conditions, need to call skb_queue_purge() on those queues further down the stack. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 7d99029df342..a140dd4a84af 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, struct sk_buff_head xmitq; int rc = 0; - __skb_queue_head_init(&xmitq); + skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); @@ -263,7 +263,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, u32 dst, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); - __skb_queue_head_init(&_pkts); + skb_queue_head_init(&_pkts); list_for_each_entry_safe(n, tmp, &dests->list, list) { dst = n->value; -- cgit From a9e2971b8cd3ef469de0112ba15778b5b98ad72e Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 7 Oct 2017 15:07:20 +0200 Subject: tipc: Unclone message at secondary destination lookup When a bundling message is received, the function tipc_link_input() calls function tipc_msg_extract() to unbundle all inner messages of the bundling message before adding them to input queue. The function tipc_msg_extract() just clones all inner skb for all inner messagges from the bundling skb. This means that the skb headroom of an inner message overlaps with the data part of the preceding message in the bundle. If the message in question is a name addressed message, it may be subject to a secondary destination lookup, and eventually be sent out on one of the interfaces again. But, since what is perceived as headroom by the device driver in reality is the last bytes of the preceding message in the bundle, the latter will be overwritten by the MAC addresses of the L2 header. If the preceding message has not yet been consumed by the user, it will evenually be delivered with corrupted contents. This commit fixes this by uncloning all messages passing through the function tipc_msg_lookup_dest(), hence ensuring that the headroom is always valid when the message is passed on. Signed-off-by: Tung Nguyen Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 121e59a1d0e7..17146c16ee2d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; + + if (!skb_cloned(skb)) + return true; + + /* Unclone buffer in case it was bundled */ + if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) + return false; + return true; } -- cgit From 49f817d793d1bcc11d721881aac037b996feef5c Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 00:44:03 +0800 Subject: netfilter: SYNPROXY: skip non-tcp packet in {ipv4, ipv6}_synproxy_hook In function {ipv4,ipv6}_synproxy_hook we expect a normal tcp packet, but the real server maybe reply an icmp error packet related to the exist tcp conntrack, so we will access wrong tcp data. Fix it by checking for the protocol field and only process tcp traffic. Signed-off-by: Lin Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_SYNPROXY.c | 3 ++- net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 811689e523c3..f75fc6b53115 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (synproxy == NULL) return NF_ACCEPT; - if (nf_is_loopback_packet(skb)) + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index a5cd43d75393..437af8c95277 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (thoff < 0) + if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); -- cgit From 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 9 Oct 2017 15:27:15 +0300 Subject: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1' Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced support for attaching an eBPF object by an fd, with the 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each IPT_SO_SET_REPLACE call. However this breaks subsequent iptables calls: # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT # iptables -A INPUT -s 5.6.7.8 -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. That's because iptables works by loading existing rules using IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with the replacement set. However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number (from the initial "iptables -m bpf" invocation) - so when 2nd invocation occurs, userspace passes a bogus fd number, which leads to 'bpf_mt_check_v1' to fail. One suggested solution [1] was to hack iptables userspace, to perform a "entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new, process-local fd per every 'xt_bpf_info_v1' entry seen. However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects. This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given '.fd' and instead perform an in-kernel lookup for the bpf object given the provided '.path'. It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is expected to provide the path of the pinned object. Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved. References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2 [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2 Reported-by: Rafael Buchbinder Signed-off-by: Shmulik Ladkani Acked-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_bpf.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 38986a95216c..29123934887b 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) return 0; } +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) +{ + mm_segment_t oldfs = get_fs(); + int retval, fd; + + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path); + set_fs(oldfs); + if (fd < 0) + return fd; + + retval = __bpf_mt_check_fd(fd, ret); + sys_close(fd); + return retval; +} + static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; @@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par) return __bpf_mt_check_bytecode(info->bpf_program, info->bpf_program_num_elem, &info->filter); - else if (info->mode == XT_BPF_MODE_FD_PINNED || - info->mode == XT_BPF_MODE_FD_ELF) + else if (info->mode == XT_BPF_MODE_FD_ELF) return __bpf_mt_check_fd(info->fd, &info->filter); + else if (info->mode == XT_BPF_MODE_PATH_PINNED) + return __bpf_mt_check_path(info->path, &info->filter); else return -EINVAL; } -- cgit From 62cf27e52b8c9a39066172ca6b6134cb5eaa9450 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 Oct 2017 08:39:43 +0200 Subject: ipv6: Fix traffic triggered IPsec connections. A recent patch removed the dst_free() on the allocated dst_entry in ipv6_blackhole_route(). The dst_free() marked the dst_entry as dead and added it to the gc list. I.e. it was setup for a one time usage. As a result we may now have a blackhole route cached at a socket on some IPsec scenarios. This makes the connection unusable. Fix this by marking the dst_entry directly at allocation time as 'dead', so it is used only once. Fixes: 587fea741134 ("ipv6: mark DST_NOGC and remove the operation of dst_free()") Reported-by: Tobias Brunner Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..a96d5b385d8f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1325,7 +1325,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); -- cgit From 6c0e7284d89995877740d8a26c3e99a937312a3c Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 Oct 2017 08:43:55 +0200 Subject: ipv4: Fix traffic triggered IPsec connections. A recent patch removed the dst_free() on the allocated dst_entry in ipv4_blackhole_route(). The dst_free() marked the dst_entry as dead and added it to the gc list. I.e. it was setup for a one time usage. As a result we may now have a blackhole route cached at a socket on some IPsec scenarios. This makes the connection unusable. Fix this by marking the dst_entry directly at allocation time as 'dead', so it is used only once. Fixes: b838d5e1c5b6 ("ipv4: mark DST_NOGC and remove the operation of dst_free()") Reported-by: Tobias Brunner Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac6fde5d45f1..3d9f1c2f81c5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2513,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; -- cgit From 41c87425a1ac9b633e0fcc78eb1f19640c8fb5a0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 9 Oct 2017 14:14:51 +0200 Subject: netlink: do not set cb_running if dump's start() errs It turns out that multiple places can call netlink_dump(), which means it's still possible to dereference partially initialized values in dump() that were the result of a faulty returned start(). This fixes the issue by calling start() _before_ setting cb_running to true, so that there's no chance at all of hitting the dump() function through any indirect paths. It also moves the call to start() to be when the mutex is held. This has the nice side effect of serializing invocations to start(), which is likely desirable anyway. It also prevents any possible other races that might come out of this logic. In testing this with several different pieces of tricky code to trigger these issues, this commit fixes all avenues that I'm aware of. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 94c11cf0459d..f34750691c5c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; + if (cb->start) { + ret = cb->start(cb); + if (ret) + goto error_unlock; + } + nlk->cb_running = true; mutex_unlock(nlk->cb_mutex); - ret = 0; - if (cb->start) - ret = cb->start(cb); - - if (!ret) - ret = netlink_dump(sk); + ret = netlink_dump(sk); sock_put(sk); -- cgit From 996b44fcef8f216ea0b6b6e74468c5a77b5e341f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 9 Oct 2017 14:52:10 +0200 Subject: udp: fix bcast packet reception The commit bc044e8db796 ("udp: perform source validation for mcast early demux") does not take into account that broadcast packets lands in the same code path and they need different checks for the source address - notably, zero source address are valid for bcast and invalid for mcast. As a result, 2nd and later broadcast packets with 0 source address landing to the same socket are dropped. This breaks dhcp servers. Since we don't have stringent performance requirements for ingress broadcast traffic, fix it by disabling UDP early demux such traffic. Reported-by: Hannes Frederic Sowa Fixes: bc044e8db796 ("udp: perform source validation for mcast early demux") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5676237d2b0f..e45177ceb0ee 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2240,20 +2240,16 @@ int udp_v4_early_demux(struct sk_buff *skb) iph = ip_hdr(skb); uh = udp_hdr(skb); - if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) { + if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; - /* we are supposed to accept bcast packets */ - if (skb->pkt_type == PACKET_MULTICAST) { - ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, - iph->protocol); - if (!ours) - return 0; - } + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, -- cgit From 9f1c2674b328a69ab5a9b5a1c52405795ee4163f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Oct 2017 21:44:51 -0700 Subject: net: memcontrol: defer call to mem_cgroup_sk_alloc() Instead of calling mem_cgroup_sk_alloc() from BH context, it is better to call it from inet_csk_accept() in process context. Not only this removes code in mem_cgroup_sk_alloc(), but it also fixes a bug since listener might have been dismantled and css_get() might cause a use-after-free. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++++- net/ipv4/inet_connection_sock.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 23953b741a41..70c6ccbdf49f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1677,6 +1677,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -1714,7 +1718,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); - mem_cgroup_sk_alloc(newsk); cgroup_sk_alloc(&newsk->sk_cgrp_data); /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c039c937ba90..67aec7a10686 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -475,6 +475,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) } spin_unlock_bh(&queue->fastopenq.lock); } + mem_cgroup_sk_alloc(newsk); out: release_sock(sk); if (req) -- cgit From fbb1fb4ad415cb31ce944f65a5ca700aaf73a227 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Oct 2017 21:44:52 -0700 Subject: net: defer call to cgroup_sk_alloc() sk_clone_lock() might run while TCP/DCCP listener already vanished. In order to prevent use after free, it is better to defer cgroup_sk_alloc() to the point we know both parent and child exist, and from process context. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: David S. Miller --- net/core/sock.c | 3 +-- net/ipv4/inet_connection_sock.c | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 70c6ccbdf49f..4499e3153813 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1680,6 +1680,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; + memset(&newsk->sk_cgrp_data, 0, sizeof(newsk->sk_cgrp_data)); atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; @@ -1718,8 +1719,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); - cgroup_sk_alloc(&newsk->sk_cgrp_data); - /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 67aec7a10686..d32c74507314 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -476,6 +478,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) spin_unlock_bh(&queue->fastopenq.lock); } mem_cgroup_sk_alloc(newsk); + cgroup_sk_alloc(&newsk->sk_cgrp_data); + sock_update_classid(&newsk->sk_cgrp_data); + sock_update_netprioidx(&newsk->sk_cgrp_data); out: release_sock(sk); if (req) -- cgit From 75cb070960ade40fba5de32138390f3c85c90941 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Oct 2017 19:12:32 -0700 Subject: Revert "net: defer call to cgroup_sk_alloc()" This reverts commit fbb1fb4ad415cb31ce944f65a5ca700aaf73a227. This was not the proper fix, lets cleanly revert it, so that following patch can be carried to stable versions. sock_cgroup_ptr() callers do not expect a NULL return value. Signed-off-by: Eric Dumazet Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: David S. Miller --- net/core/sock.c | 3 ++- net/ipv4/inet_connection_sock.c | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 4499e3153813..70c6ccbdf49f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1680,7 +1680,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; - memset(&newsk->sk_cgrp_data, 0, sizeof(newsk->sk_cgrp_data)); atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; @@ -1719,6 +1718,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d32c74507314..67aec7a10686 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -26,8 +26,6 @@ #include #include #include -#include -#include #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -478,9 +476,6 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) spin_unlock_bh(&queue->fastopenq.lock); } mem_cgroup_sk_alloc(newsk); - cgroup_sk_alloc(&newsk->sk_cgrp_data); - sock_update_classid(&newsk->sk_cgrp_data); - sock_update_netprioidx(&newsk->sk_cgrp_data); out: release_sock(sk); if (req) -- cgit From c0576e3975084d4699b7bfef578613fb8e1144f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Oct 2017 19:12:33 -0700 Subject: net: call cgroup_sk_alloc() earlier in sk_clone_lock() If for some reason, the newly allocated child need to be freed, we will call cgroup_put() (via sk_free_unlock_clone()) while the corresponding cgroup_get() was not yet done, and we will free memory too soon. Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") Signed-off-by: Eric Dumazet Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: David S. Miller --- net/core/sock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 70c6ccbdf49f..415f441c63b9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1687,6 +1687,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); + cgroup_sk_alloc(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); @@ -1718,8 +1719,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); - cgroup_sk_alloc(&newsk->sk_cgrp_data); - /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit From 10a7ef33679073d13bf1dd05e3f1b7912f999543 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 10 Oct 2017 20:59:38 -0700 Subject: ipsec: Fix dst leak in xfrm_bundle_create(). If we cannot find a suitable inner_mode value, we will leak the currently allocated 'xdst'. The fix is to make sure it is linked into the chain before erroring out. Signed-off-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f06253969972..2746b62a8944 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1573,6 +1573,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } + if (!dst_prev) + dst0 = dst1; + else + /* Ref count is taken during xfrm_alloc_dst() + * No need to do dst_clone() on dst1 + */ + dst_prev->child = dst1; + if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); @@ -1584,14 +1592,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, } else inner_mode = xfrm[i]->inner_mode; - if (!dst_prev) - dst0 = dst1; - else - /* Ref count is taken during xfrm_alloc_dst() - * No need to do dst_clone() on dst1 - */ - dst_prev->child = dst1; - xdst->route = dst; dst_copy_metrics(dst1, dst); -- cgit From 6e9c0075409d4ec1bc63558ee5a93916a6d7d16f Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Wed, 11 Oct 2017 16:54:27 +1100 Subject: net/ncsi: Don't limit vids based on hot_channel Currently we drop any new VLAN ids if there are more than the current (or last used) channel can support. Most importantly this is a problem if no channel has been selected yet, resulting in a segfault. Secondly this does not necessarily reflect the capabilities of any other channels. Instead only drop a new VLAN id if we are already tracking the maximum allowed by the NCSI specification. Per-channel limits are already handled by ncsi_add_filter(), but add a message to set_one_vid() to make it obvious that the channel can not support any more VLAN ids. Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/internal.h | 1 + net/ncsi/ncsi-manage.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index af3d636534ef..d30f7bd741d0 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -286,6 +286,7 @@ struct ncsi_dev_priv { struct work_struct work; /* For channel management */ struct packet_type ptype; /* NCSI packet Rx handler */ struct list_head node; /* Form NCSI device list */ +#define NCSI_MAX_VLAN_VIDS 15 struct list_head vlan_vids; /* List of active VLAN IDs */ }; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 3fd3c39e6278..b6a449aa9d4b 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -732,6 +732,10 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, if (index < 0) { netdev_err(ndp->ndev.dev, "Failed to add new VLAN tag, error %d\n", index); + if (index == -ENOSPC) + netdev_err(ndp->ndev.dev, + "Channel %u already has all VLAN filters set\n", + nc->id); return -1; } @@ -1403,7 +1407,6 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { - struct ncsi_channel_filter *ncf; struct ncsi_dev_priv *ndp; unsigned int n_vids = 0; struct vlan_vid *vlan; @@ -1420,7 +1423,6 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) } ndp = TO_NCSI_DEV_PRIV(nd); - ncf = ndp->hot_channel->filters[NCSI_FILTER_VLAN]; /* Add the VLAN id to our internal list */ list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { @@ -1431,12 +1433,11 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) return 0; } } - - if (n_vids >= ncf->total) { - netdev_info(dev, - "NCSI Channel supports up to %u VLAN tags but %u are already set\n", - ncf->total, n_vids); - return -EINVAL; + if (n_vids >= NCSI_MAX_VLAN_VIDS) { + netdev_warn(dev, + "tried to add vlan id %u but NCSI max already registered (%u)\n", + vid, NCSI_MAX_VLAN_VIDS); + return -ENOSPC; } vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); -- cgit From 12ed3772b7e11b53a58ecbd8ce258271fb148cc6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 11 Oct 2017 20:10:31 -0700 Subject: ip: update policy routing config help The kernel config help for policy routing was still pointing at an ancient document from 2000 that refers to Linux 2.1. Update it to point to something that is at least occasionally updated. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 91a2557942fa..f48fe6fc7e8c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -70,11 +70,9 @@ config IP_MULTIPLE_TABLES address into account. Furthermore, the TOS (Type-Of-Service) field of the packet can be used for routing decisions as well. - If you are interested in this, please see the preliminary - documentation at - and . - You will need supporting software from - . + If you need more information, see the Linux Advanced + Routing and Traffic Control documentation at + If unsure, say N. -- cgit From 09001b03f722be96827bf8df5ba4d48b7ec0cc30 Mon Sep 17 00:00:00 2001 From: Wenhua Shi Date: Sat, 14 Oct 2017 18:51:36 +0200 Subject: net: fix typo in skbuff.c Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16982de649b9..e62476beee95 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1896,7 +1896,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) } /* If we need update frag list, we are in troubles. - * Certainly, it possible to add an offset to skb data, + * Certainly, it is possible to add an offset to skb data, * but taking into account that pulling is expected to * be very rare operation, it is worth to fight against * further bloating skb head and crucify ourselves here instead. -- cgit From 5903f594935a3841137c86b9d5b75143a5b7121c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 13 Oct 2017 19:22:35 +0200 Subject: l2tp: check ps->sock before running pppol2tp_session_ioctl() When pppol2tp_session_ioctl() is called by pppol2tp_tunnel_ioctl(), the session may be unconnected. That is, it was created by pppol2tp_session_create() and hasn't been connected with pppol2tp_connect(). In this case, ps->sock is NULL, so we need to check for this case in order to avoid dereferencing a NULL pointer. Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index bc6e8bfc5be4..f50452b919d5 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -988,6 +988,9 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, session->name, cmd, arg); sk = ps->sock; + if (!sk) + return -EBADR; + sock_hold(sk); switch (cmd) { -- cgit From fdf7cb4185b60c68e1a75e61691c4afdc15dea0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Sep 2017 14:54:54 +0200 Subject: mac80211: accept key reinstall without changing anything When a key is reinstalled we can reset the replay counters etc. which can lead to nonce reuse and/or replay detection being impossible, breaking security properties, as described in the "KRACK attacks". In particular, CVE-2017-13080 applies to GTK rekeying that happened in firmware while the host is in D3, with the second part of the attack being done after the host wakes up. In this case, the wpa_supplicant mitigation isn't sufficient since wpa_supplicant doesn't know the GTK material. In case this happens, simply silently accept the new key coming from userspace but don't take any action on it since it's the same key; this keeps the PN replay counters intact. Signed-off-by: Johannes Berg --- net/mac80211/key.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index a98fc2b5e0dc..ae995c8480db 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -620,9 +620,6 @@ int ieee80211_key_link(struct ieee80211_key *key, pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; idx = key->conf.keyidx; - key->local = sdata->local; - key->sdata = sdata; - key->sta = sta; mutex_lock(&sdata->local->key_mtx); @@ -633,6 +630,21 @@ int ieee80211_key_link(struct ieee80211_key *key, else old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]); + /* + * Silently accept key re-installation without really installing the + * new version of the key to avoid nonce reuse or replay issues. + */ + if (old_key && key->conf.keylen == old_key->conf.keylen && + !memcmp(key->conf.key, old_key->conf.key, key->conf.keylen)) { + ieee80211_key_free_unused(key); + ret = 0; + goto out; + } + + key->local = sdata->local; + key->sdata = sdata; + key->sta = sta; + increment_tailroom_need_count(sdata); ieee80211_key_replace(sdata, sta, pairwise, old_key, key); @@ -648,6 +660,7 @@ int ieee80211_key_link(struct ieee80211_key *key, ret = 0; } + out: mutex_unlock(&sdata->local->key_mtx); return ret; -- cgit From 4c625a974fb81724e60966b677e47fcba782c950 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Oct 2017 14:08:55 -0400 Subject: SUNRPC: fix a list corruption issue in xprt_release() We remove the request from the receive list before we call xprt_wait_on_pinned_rqst(), and so we need to use list_del_init(). Otherwise, we will see list corruption when xprt_complete_rqst() is called. Reported-by: Emre Celebi Fixes: ce7c252a8c741 ("SUNRPC: Add a separate spinlock to protect...") Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e741ec2b4d8e..1a39ad14c42f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1333,7 +1333,7 @@ void xprt_release(struct rpc_task *task) rpc_count_iostats(task, task->tk_client->cl_metrics); spin_lock(&xprt->recv_lock); if (!list_empty(&req->rq_list)) { - list_del(&req->rq_list); + list_del_init(&req->rq_list); xprt_wait_on_pinned_rqst(req); } spin_unlock(&xprt->recv_lock); -- cgit From 8a212589fe0e45f26c549dfa271a157ca8eea1ac Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Oct 2017 18:13:41 +0800 Subject: rtnetlink: bring NETDEV_CHANGEMTU event process back in rtnetlink_event Commit 085e1a65f04f ("rtnetlink: Do not generate notifications for MTU events") tried to fix the redundant notifications issue when ip link set mtu by removing NETDEV_CHANGEMTU event process in rtnetlink_event. But it also resulted in no notification generated when dev's mtu is changed via other methods, like: 'ifconfig eth1 mtu 1400' or 'echo 1400 > /sys/class/net/eth1/mtu' It would cause users not to be notified by this change. This patch is to fix it by bringing NETDEV_CHANGEMTU event back into rtnetlink_event, and the redundant notifications issue will be fixed in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY correctly in do_setlink'. Fixes: 085e1a65f04f ("rtnetlink: Do not generate notifications for MTU events") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d4bcdcc68e92..72053ed7c891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4279,6 +4279,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi switch (event) { case NETDEV_REBOOT: + case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: -- cgit From ebdcf0450b020748c2dab6bfe44a5ac3c5159fb0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Oct 2017 18:13:42 +0800 Subject: rtnetlink: bring NETDEV_CHANGE_TX_QUEUE_LEN event process back in rtnetlink_event The same fix for changing mtu in the patch 'rtnetlink: bring NETDEV_CHANGEMTU event process back in rtnetlink_event' is needed for changing tx_queue_len. Note that the redundant notifications issue for tx_queue_len will be fixed in the later patch 'rtnetlink: do not send notification for tx_queue_len in do_setlink'. Fixes: 27b3b551d8a7 ("rtnetlink: Do not generate notifications for NETDEV_CHANGE_TX_QUEUE_LEN event") Signed-off-by: Xin Long Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 72053ed7c891..bf473604f33d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4287,6 +4287,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_NOTIFY_PEERS: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: + case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL); break; -- cgit From e6e6659446c87057aede26a39d9f16b19001716f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Oct 2017 18:13:43 +0800 Subject: rtnetlink: bring NETDEV_POST_TYPE_CHANGE event process back in rtnetlink_event As I said in patch 'rtnetlink: bring NETDEV_CHANGEMTU event process back in rtnetlink_event', removing NETDEV_POST_TYPE_CHANGE event was not the right fix for the redundant notifications issue. So bring this event process back to rtnetlink_event and the old redundant notifications issue would be fixed in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY correctly in do_setlink'. Fixes: aef091ae58aa ("rtnetlink: Do not generate notifications for POST_TYPE_CHANGE event") Signed-off-by: Xin Long Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bf473604f33d..8e44fd597f46 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4284,6 +4284,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: + case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: -- cgit From dc709f375743ebf5c9326cc9b946f6f09a34ac44 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Oct 2017 18:13:44 +0800 Subject: rtnetlink: bring NETDEV_CHANGEUPPER event process back in rtnetlink_event libteam needs this event notification in userspace when dev's master dev has been changed. After this, the redundant notifications issue would be fixed in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY correctly in do_setlink'. Fixes: b6b36eb23a46 ("rtnetlink: Do not generate notifications for NETDEV_CHANGEUPPER event") Signed-off-by: Xin Long Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8e44fd597f46..ab98c1c8b6f3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4286,6 +4286,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_BONDING_FAILOVER: case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: + case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: case NETDEV_CHANGE_TX_QUEUE_LEN: -- cgit From 64ff90cc2e6f42596d7a0c37e41dc95292bb63b1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Oct 2017 18:13:45 +0800 Subject: rtnetlink: check DO_SETLINK_NOTIFY correctly in do_setlink The check 'status & DO_SETLINK_NOTIFY' in do_setlink doesn't really work after status & DO_SETLINK_MODIFIED, as: DO_SETLINK_MODIFIED 0x1 DO_SETLINK_NOTIFY 0x3 Considering that notifications are suppposed to be sent only when status have the flag DO_SETLINK_NOTIFY, the right check would be: (status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY This would avoid lots of duplicated notifications when setting some properties of a link. Fixes: ba9989069f4e ("rtnl/do_setlink(): notify when a netdev is modified") Signed-off-by: Xin Long Acked-by: David Ahern Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab98c1c8b6f3..3e98fb557598 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2248,7 +2248,7 @@ static int do_setlink(const struct sk_buff *skb, errout: if (status & DO_SETLINK_MODIFIED) { - if (status & DO_SETLINK_NOTIFY) + if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netdev_state_change(dev); if (err < 0) -- cgit From 2d7f669b42a97022c8c2b6cd86f3990be5fcd1bc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Oct 2017 18:13:46 +0800 Subject: rtnetlink: do not set notification for tx_queue_len in do_setlink NETDEV_CHANGE_TX_QUEUE_LEN event process in rtnetlink_event would send a notification for userspace and tx_queue_len's setting in do_setlink would trigger NETDEV_CHANGE_TX_QUEUE_LEN. So it shouldn't set DO_SETLINK_NOTIFY status for this change to send a notification any more. Signed-off-by: Xin Long Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3e98fb557598..a6bcf86ce471 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2093,7 +2093,7 @@ static int do_setlink(const struct sk_buff *skb, dev->tx_queue_len = orig_len; goto errout; } - status |= DO_SETLINK_NOTIFY; + status |= DO_SETLINK_MODIFIED; } } -- cgit From 2459b4c635858094df78abb9ca87d99f89fe8ca5 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 11 Oct 2017 16:24:48 +0200 Subject: net: enable interface alias removal via rtnl IFLA_IFALIAS is defined as NLA_STRING. It means that the minimal length of the attribute is 1 ("\0"). However, to remove an alias, the attribute length must be 0 (see dev_set_alias()). Let's define the type to NLA_BINARY to allow 0-length string, so that the alias can be removed. Example: $ ip l s dummy0 alias foo $ ip l l dev dummy0 5: dummy0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether ae:20:30:4f:a7:f3 brd ff:ff:ff:ff:ff:ff alias foo Before the patch: $ ip l s dummy0 alias "" RTNETLINK answers: Numerical result out of range After the patch: $ ip l s dummy0 alias "" $ ip l l dev dummy0 5: dummy0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether ae:20:30:4f:a7:f3 brd ff:ff:ff:ff:ff:ff CC: Oliver Hartkopp CC: Stephen Hemminger Fixes: 96ca4a2cc145 ("net: remove ifalias on empty given alias") Reported-by: Julien FLoret Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a6bcf86ce471..5ace48926b19 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1483,7 +1483,10 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, - [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, + /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to + * allow 0-length string (needed to remove an alias). + */ + [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, -- cgit From 0ad646c81b2182f7fa67ec0c8c825e0ee165696d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 11:58:53 -0700 Subject: tun: call dev_get_valid_name() before register_netdevice() register_netdevice() could fail early when we have an invalid dev name, in which case ->ndo_uninit() is not called. For tun device, this is a problem because a timer etc. are already initialized and it expects ->ndo_uninit() to clean them up. We could move these initializations into a ->ndo_init() so that register_netdevice() knows better, however this is still complicated due to the logic in tun_detach(). Therefore, I choose to just call dev_get_valid_name() before register_netdevice(), which is quicker and much easier to audit. And for this specific case, it is already enough. Fixes: 96442e42429e ("tuntap: choose the txq based on rxq") Reported-by: Dmitry Alexeev Cc: Jason Wang Cc: "Michael S. Tsirkin" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 588b473194a8..11596a302a26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1147,9 +1147,8 @@ static int dev_alloc_name_ns(struct net *net, return ret; } -static int dev_get_valid_name(struct net *net, - struct net_device *dev, - const char *name) +int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1165,6 +1164,7 @@ static int dev_get_valid_name(struct net *net, return 0; } +EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device -- cgit From c019b5166e11faaf9ed3b64316ed338eaa19de60 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 16 Oct 2017 12:19:48 +0300 Subject: net/sched: cls_flower: Set egress_dev mark when calling into the HW driver Commit 7091d8c '(net/sched: cls_flower: Add offload support using egress Hardware device') made sure (when fl_hw_replace_filter is called) to put the egress_dev mark on persisent structure instance. Hence, following calls into the HW driver for stats and deletion will note it and act accordingly. With commit de4784ca030f this property is lost and hence when called, the HW driver failes to operate (stats, delete) on the offloaded flow. Fix it by setting the egress_dev flag whenever the ingress device is different from the hw device since this is exactly the condition under which we're calling into the HW driver through the egress port net-device. Fixes: de4784ca030f ('net: sched: get rid of struct tc_to_netdev') Signed-off-by: Or Gerlitz Signed-off-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d230cb4c8094..b480d7c792ba 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -234,6 +234,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; + cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); } @@ -289,6 +290,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; + cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); -- cgit From 823038ca030e9f8283518b1e6a5a6879edcbe057 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 16 Oct 2017 19:43:15 +0800 Subject: dev_ioctl: add missing NETDEV_CHANGE_TX_QUEUE_LEN event notification When changing dev tx_queue_len via netlink or net-sysfs, a NETDEV_CHANGE_TX_QUEUE_LEN event notification will be called. But dev_ioctl missed this event notification, which could cause no userspace notification would be sent. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 709a4e6fb447..f9c7a88cd981 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -303,7 +303,18 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; + if (dev->tx_queue_len ^ ifr->ifr_qlen) { + unsigned int orig_len = dev->tx_queue_len; + + dev->tx_queue_len = ifr->ifr_qlen; + err = call_netdevice_notifiers( + NETDEV_CHANGE_TX_QUEUE_LEN, dev); + err = notifier_to_errno(err); + if (err) { + dev->tx_queue_len = orig_len; + return err; + } + } return 0; case SIOCSIFNAME: -- cgit From 2bdd713b92a9cade239d3c7d15205a09f556624d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 17 Oct 2017 20:32:07 +0200 Subject: mac80211: use constant time comparison with keys Otherwise we risk leaking information via timing side channel. Fixes: fdf7cb4185b6 ("mac80211: accept key reinstall without changing anything") Signed-off-by: Jason A. Donenfeld Signed-off-by: Johannes Berg --- net/mac80211/key.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index ae995c8480db..035d16fe926e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "ieee80211_i.h" #include "driver-ops.h" @@ -635,7 +636,7 @@ int ieee80211_key_link(struct ieee80211_key *key, * new version of the key to avoid nonce reuse or replay issues. */ if (old_key && key->conf.keylen == old_key->conf.keylen && - !memcmp(key->conf.key, old_key->conf.key, key->conf.keylen)) { + !crypto_memneq(key->conf.key, old_key->conf.key, key->conf.keylen)) { ieee80211_key_free_unused(key); ret = 0; goto out; -- cgit From 51e13359cd5ea34acc62c90627603352956380af Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 Oct 2017 21:56:20 +0200 Subject: cfg80211: fix connect/disconnect edge cases If we try to connect while already connected/connecting, but this fails, we set ssid_len=0 but leave current_bss hanging, leading to errors. Check all of this better, first of all ensuring that we can't try to connect to a different SSID while connected/ing; ensure that prev_bssid is set for re-association attempts even in the case of the driver supporting the connect() method, and don't reset ssid_len in the failure cases. While at it, also reset ssid_len while disconnecting unless we were connected and expect a disconnected event, and warn on a successful connection without ssid_len being set. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/wireless/sme.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 0a49b88070d0..b6533ecbf5b1 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -522,11 +522,6 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, return -EOPNOTSUPP; if (wdev->current_bss) { - if (!prev_bssid) - return -EALREADY; - if (prev_bssid && - !ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid)) - return -ENOTCONN; cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; @@ -1063,11 +1058,35 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - if (WARN_ON(wdev->connect_keys)) { - kzfree(wdev->connect_keys); - wdev->connect_keys = NULL; + /* + * If we have an ssid_len, we're trying to connect or are + * already connected, so reject a new SSID unless it's the + * same (which is the case for re-association.) + */ + if (wdev->ssid_len && + (wdev->ssid_len != connect->ssid_len || + memcmp(wdev->ssid, connect->ssid, wdev->ssid_len))) + return -EALREADY; + + /* + * If connected, reject (re-)association unless prev_bssid + * matches the current BSSID. + */ + if (wdev->current_bss) { + if (!prev_bssid) + return -EALREADY; + if (!ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid)) + return -ENOTCONN; } + /* + * Reject if we're in the process of connecting with WEP, + * this case isn't very interesting and trying to handle + * it would make the code much more complex. + */ + if (wdev->connect_keys) + return -EINPROGRESS; + cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); @@ -1118,7 +1137,12 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, if (err) { wdev->connect_keys = NULL; - wdev->ssid_len = 0; + /* + * This could be reassoc getting refused, don't clear + * ssid_len in that case. + */ + if (!wdev->current_bss) + wdev->ssid_len = 0; return err; } @@ -1145,6 +1169,14 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, else if (wdev->ssid_len) err = rdev_disconnect(rdev, dev, reason); + /* + * Clear ssid_len unless we actually were fully connected, + * in which case cfg80211_disconnected() will take care of + * this later. + */ + if (!wdev->current_bss) + wdev->ssid_len = 0; + return err; } -- cgit From e5f5ce37a7918ed7406c52987c7cc8b670ed5e14 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Oct 2017 09:36:51 +0200 Subject: mac80211: validate user rate mask before configuring driver Ben reported that when the user rate mask is rejected for not matching any basic rate, the driver had already been configured. This is clearly an oversight in my original change, fix this by doing the validation before calling the driver. Reported-by: Ben Greear Fixes: e8e4f5280ddd ("mac80211: reject/clear user rate mask if not usable") Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a354f1939e49..fb15d3b97cb2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2727,12 +2727,6 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { - ret = drv_set_bitrate_mask(local, sdata, mask); - if (ret) - return ret; - } - /* * If active validate the setting and reject it if it doesn't leave * at least one basic rate usable, since we really have to be able @@ -2748,6 +2742,12 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return -EINVAL; } + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { + ret = drv_set_bitrate_mask(local, sdata, mask); + if (ret) + return ret; + } + for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; -- cgit From 363b02dab09b3226f3bd1420dad9c72b79a42a76 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Oct 2017 16:43:25 +0100 Subject: KEYS: Fix race between updating and finding a negative key Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection error into one field such that: (1) The instantiation state can be modified/read atomically. (2) The error can be accessed atomically with the state. (3) The error isn't stored unioned with the payload pointers. This deals with the problem that the state is spread over three different objects (two bits and a separate variable) and reading or updating them atomically isn't practical, given that not only can uninstantiated keys change into instantiated or rejected keys, but rejected keys can also turn into instantiated keys - and someone accessing the key might not be using any locking. The main side effect of this problem is that what was held in the payload may change, depending on the state. For instance, you might observe the key to be in the rejected state. You then read the cached error, but if the key semaphore wasn't locked, the key might've become instantiated between the two reads - and you might now have something in hand that isn't actually an error code. The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative error code if the key is negatively instantiated. The key_is_instantiated() function is replaced with key_is_positive() to avoid confusion as negative keys are also 'instantiated'. Additionally, barriering is included: (1) Order payload-set before state-set during instantiation. (2) Order state-read before payload-read when using the key. Further separate barriering is necessary if RCU is being used to access the payload content after reading the payload pointers. Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data") Cc: stable@vger.kernel.org # v4.4+ Reported-by: Eric Biggers Signed-off-by: David Howells Reviewed-by: Eric Biggers --- net/dns_resolver/dns_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 8737412c7b27..e1d4d898a007 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -224,7 +224,7 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data) static void dns_resolver_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); - if (key_is_instantiated(key)) { + if (key_is_positive(key)) { int err = PTR_ERR(key->payload.data[dns_key_error]); if (err) -- cgit From 48044eb490be71c203e14dd89e8bae87209eab52 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 16 Oct 2017 17:09:53 +0200 Subject: netlink: fix netlink_ack() extack race It seems that it's possible to toggle NETLINK_F_EXT_ACK through setsockopt() while another thread/CPU is building a message inside netlink_ack(), which could then trigger the WARN_ON()s I added since if it goes from being turned off to being turned on between allocating and filling the message, the skb could end up being too small. Avoid this whole situation by storing the value of this flag in a separate variable and using that throughout the function instead. Fixes: 2d4bc93368f5 ("netlink: extended ACK reporting") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f34750691c5c..b93148e8e9fb 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2307,6 +2307,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, size_t tlvlen = 0; struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; + bool nlk_has_extack = nlk->flags & NETLINK_F_EXT_ACK; /* Error messages get the original request appened, unless the user * requests to cap the error message, and get extra error data if @@ -2317,7 +2318,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; - if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (nlk_has_extack && extack) { if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->bad_attr) @@ -2326,8 +2327,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, } else { flags |= NLM_F_CAPPED; - if (nlk->flags & NETLINK_F_EXT_ACK && - extack && extack->cookie_len) + if (nlk_has_extack && extack && extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); } @@ -2355,7 +2355,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); - if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (nlk_has_extack && extack) { if (err) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, -- cgit From 62c04647c6f44fa3d5d0c077133da0aa1cbbc34c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Sep 2017 16:02:35 +0100 Subject: can: bcm: check for null sk before deferencing it via the call to sock_net The assignment of net via call sock_net will dereference sk. This is performed before a sanity null check on sk, so there could be a potential null dereference on the sock_net call if sk is null. Fix this by assigning net after the sk null check. Also replace the sk == NULL with the more usual !sk idiom. Detected by CoverityScan CID#1431862 ("Dereference before null check") Fixes: 384317ef4187 ("can: network namespace support for CAN_BCM protocol") Signed-off-by: Colin Ian King Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index 47a8748d953a..13690334efa3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1493,13 +1493,14 @@ static int bcm_init(struct sock *sk) static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); + struct net *net; struct bcm_sock *bo; struct bcm_op *op, *next; - if (sk == NULL) + if (!sk) return 0; + net = sock_net(sk); bo = bcm_sk(sk); /* remove bcm_ops, timer, rx_unregister(), etc. */ -- cgit From cae1d5b78fb4874086170ad07921bca59ea2e893 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 17 Oct 2017 07:18:35 +0200 Subject: can: af_can: do not access proto_tab directly use rcu_access_pointer instead "proto_tab" is a RCU protected array, when directly accessing the array, sparse throws these warnings: CHECK /srv/work/frogger/socketcan/linux/net/can/af_can.c net/can/af_can.c:115:14: error: incompatible types in comparison expression (different address spaces) net/can/af_can.c:795:17: error: incompatible types in comparison expression (different address spaces) net/can/af_can.c:816:9: error: incompatible types in comparison expression (different address spaces) This patch fixes the problem by using rcu_access_pointer() and annotating "proto_tab" array as __rcu. Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 88edac0f3e36..eb1ad74b40f4 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -78,7 +78,7 @@ MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ -static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly; +static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); @@ -788,7 +788,7 @@ int can_proto_register(const struct can_proto *cp) mutex_lock(&proto_tab_lock); - if (proto_tab[proto]) { + if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else @@ -812,7 +812,7 @@ void can_proto_unregister(const struct can_proto *cp) int proto = cp->protocol; mutex_lock(&proto_tab_lock); - BUG_ON(proto_tab[proto] != cp); + BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp); RCU_INIT_POINTER(proto_tab[proto], NULL); mutex_unlock(&proto_tab_lock); -- cgit From 5a606223c6b5b7560da253ed52e62c67fa18e29b Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Sat, 29 Jul 2017 11:51:01 +0200 Subject: can: af_can: can_pernet_init(): add missing error handling for kzalloc returning NULL This patch adds the missing check and error handling for out-of-memory situations, when kzalloc cannot allocate memory. Fixes: cb5635a36776 ("can: complete initial namespace support") Acked-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index eb1ad74b40f4..ecd5c703d11e 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -875,9 +875,14 @@ static int can_pernet_init(struct net *net) spin_lock_init(&net->can.can_rcvlists_lock); net->can.can_rx_alldev_list = kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL); - + if (!net->can.can_rx_alldev_list) + goto out; net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL); + if (!net->can.can_stats) + goto out_free_alldev_list; net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL); + if (!net->can.can_pstats) + goto out_free_can_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ @@ -892,6 +897,13 @@ static int can_pernet_init(struct net *net) } return 0; + + out_free_can_stats: + kfree(net->can.can_stats); + out_free_alldev_list: + kfree(net->can.can_rx_alldev_list); + out: + return -ENOMEM; } static void can_pernet_exit(struct net *net) -- cgit From df80cd9b28b9ebaa284a41df611dbf3a2d05ca74 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 17 Oct 2017 23:26:10 +0800 Subject: sctp: do not peel off an assoc from one netns to another one Now when peeling off an association to the sock in another netns, all transports in this assoc are not to be rehashed and keep use the old key in hashtable. As a transport uses sk->net as the hash key to insert into hashtable, it would miss removing these transports from hashtable due to the new netns when closing the sock and all transports are being freeed, then later an use-after-free issue could be caused when looking up an asoc and dereferencing those transports. This is a very old issue since very beginning, ChunYu found it with syzkaller fuzz testing with this series: socket$inet6_sctp() bind$inet6() sendto$inet6() unshare(0x40000000) getsockopt$inet_sctp6_SCTP_GET_ASSOC_ID_LIST() getsockopt$inet_sctp6_SCTP_SOCKOPT_PEELOFF() This patch is to block this call when peeling one assoc off from one netns to another one, so that the netns of all transport would not go out-sync with the key in hashtable. Note that this patch didn't fix it by rehashing transports, as it's difficult to handle the situation when the tuple is already in use in the new netns. Besides, no one would like to peel off one assoc to another netns, considering ipaddrs, ifaces, etc. are usually different. Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d4730ada7f32..17841ab30798 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4906,6 +4906,10 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) struct socket *sock; int err = 0; + /* Do not peel off from one netns to another one. */ + if (!net_eq(current->nsproxy->net_ns, sock_net(sk))) + return -EINVAL; + if (!asoc) return -EINVAL; -- cgit From 528fd3547bad0bdd31c8f987e5bd00c83df8af39 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Oct 2017 12:13:10 -0400 Subject: SUNRPC: Destroy transport from the system workqueue The transport may need to flush transport connect and receive tasks that are running on rpciod. In order to do so safely, we need to ensure that the caller of cancel_work_sync() etc is not itself running on rpciod. Do so by running the destroy task from the system workqueue. Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1a39ad14c42f..898485e3ece4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1445,6 +1445,23 @@ out: return xprt; } +static void xprt_destroy_cb(struct work_struct *work) +{ + struct rpc_xprt *xprt = + container_of(work, struct rpc_xprt, task_cleanup); + + rpc_xprt_debugfs_unregister(xprt); + rpc_destroy_wait_queue(&xprt->binding); + rpc_destroy_wait_queue(&xprt->pending); + rpc_destroy_wait_queue(&xprt->sending); + rpc_destroy_wait_queue(&xprt->backlog); + kfree(xprt->servername); + /* + * Tear down transport state and free the rpc_xprt + */ + xprt->ops->destroy(xprt); +} + /** * xprt_destroy - destroy an RPC transport, killing off all requests. * @xprt: transport to destroy @@ -1454,22 +1471,19 @@ static void xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); - /* Exclude transport connect/disconnect handlers */ + /* + * Exclude transport connect/disconnect handlers and autoclose + */ wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE); del_timer_sync(&xprt->timer); - rpc_xprt_debugfs_unregister(xprt); - rpc_destroy_wait_queue(&xprt->binding); - rpc_destroy_wait_queue(&xprt->pending); - rpc_destroy_wait_queue(&xprt->sending); - rpc_destroy_wait_queue(&xprt->backlog); - cancel_work_sync(&xprt->task_cleanup); - kfree(xprt->servername); /* - * Tear down transport state and free the rpc_xprt + * Destroy sockets etc from the system workqueue so they can + * safely flush receive work running on rpciod. */ - xprt->ops->destroy(xprt); + INIT_WORK(&xprt->task_cleanup, xprt_destroy_cb); + schedule_work(&xprt->task_cleanup); } static void xprt_destroy_kref(struct kref *kref) -- cgit From 1cc276cec9ec574d41cf47dfc0f51406b6f26ab4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 18 Oct 2017 21:37:49 +0800 Subject: sctp: add the missing sock_owned_by_user check in sctp_icmp_redirect Now sctp processes icmp redirect packet in sctp_icmp_redirect where it calls sctp_transport_dst_check in which tp->dst can be released. The problem is before calling sctp_transport_dst_check, it doesn't check sock_owned_by_user, which means tp->dst could be freed while a process is accessing it with owning the socket. An use-after-free issue could be triggered by this. This patch is to fix it by checking sock_owned_by_user before calling sctp_transport_dst_check in sctp_icmp_redirect, so that it would not release tp->dst if users still hold sock lock. Besides, the same issue fixed in commit 45caeaa5ac0b ("dccp/tcp: fix routing redirect race") on sctp also needs this check. Fixes: 55be7a9c6074 ("ipv4: Add redirect support to all protocol icmp error handlers") Reported-by: Eric Dumazet Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 92a07141fd07..34f10e75f3b9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -421,7 +421,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, { struct dst_entry *dst; - if (!t) + if (sock_owned_by_user(sk) || !t) return; dst = sctp_transport_dst_check(t); if (dst) -- cgit From 34f79502bbcfab659b8729da68b5e387f96eb4c1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:10:36 -0700 Subject: bpf: avoid preempt enable/disable in sockmap using tcp_skb_cb region SK_SKB BPF programs are run from the socket/tcp context but early in the stack before much of the TCP metadata is needed in tcp_skb_cb. So we can use some unused fields to place BPF metadata needed for SK_SKB programs when implementing the redirect function. This allows us to drop the preempt disable logic. It does however require an API change so sk_redirect_map() has been updated to additionally provide ctx_ptr to skb. Note, we do however continue to disable/enable preemption around actual BPF program running to account for map updates. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 74b8c91fb5f4..ca1ba0bbfbc2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1839,31 +1839,31 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); if (unlikely(flags)) return SK_ABORTED; - ri->ifindex = key; - ri->flags = flags; - ri->map = map; + tcb->bpf.key = key; + tcb->bpf.flags = flags; + tcb->bpf.map = map; return SK_REDIRECT; } -struct sock *do_sk_redirect_map(void) +struct sock *do_sk_redirect_map(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk = NULL; - if (ri->map) { - sk = __sock_map_lookup_elem(ri->map, ri->ifindex); + if (tcb->bpf.map) { + sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - ri->ifindex = 0; - ri->map = NULL; - /* we do not clear flags for future lookup */ + tcb->bpf.key = 0; + tcb->bpf.map = NULL; } return sk; @@ -1873,9 +1873,10 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) -- cgit From f7e9cb1ecb6d922584abff16db07930162c57155 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:10:58 -0700 Subject: bpf: remove mark access for SK_SKB program types The skb->mark field is a union with reserved_tailroom which is used in the TCP code paths from stream memory allocation. Allowing SK_SKB programs to set this field creates a conflict with future code optimizations, such as "gifting" the skb to the egress path instead of creating a new skb and doing a memcpy. Because we do not have a released version of SK_SKB yet lets just remove it for now. A more appropriate scratch pad to use at the socket layer is dev_scratch, but lets add that in future kernels when needed. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index ca1ba0bbfbc2..aa0265997f93 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3684,7 +3684,6 @@ static bool sk_skb_is_valid_access(int off, int size, { if (type == BPF_WRITE) { switch (off) { - case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; @@ -3694,6 +3693,7 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_classid): return false; case bpf_ctx_range(struct __sk_buff, data): -- cgit From c92e8c02fe664155ac4234516e32544bec0f113d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2017 09:04:13 -0700 Subject: tcp/dccp: fix ireq->opt races syzkaller found another bug in DCCP/TCP stacks [1] For the reasons explained in commit ce1050089c96 ("tcp/dccp: fix ireq->pktopts race"), we need to make sure we do not access ireq->opt unless we own the request sock. Note the opt field is renamed to ireq_opt to ease grep games. [1] BUG: KASAN: use-after-free in ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474 Read of size 1 at addr ffff8801c951039c by task syz-executor5/3295 CPU: 1 PID: 3295 Comm: syz-executor5 Not tainted 4.14.0-rc4+ #80 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:427 ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474 tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1135 tcp_send_ack.part.37+0x3bb/0x650 net/ipv4/tcp_output.c:3587 tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3557 __tcp_ack_snd_check+0x2c6/0x4b0 net/ipv4/tcp_input.c:5072 tcp_ack_snd_check net/ipv4/tcp_input.c:5085 [inline] tcp_rcv_state_process+0x2eff/0x4850 net/ipv4/tcp_input.c:6071 tcp_child_process+0x342/0x990 net/ipv4/tcp_minisocks.c:816 tcp_v4_rcv+0x1827/0x2f80 net/ipv4/tcp_ipv4.c:1682 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:249 [inline] ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:464 [inline] ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:249 [inline] ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 netif_receive_skb+0xae/0x390 net/core/dev.c:4611 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 call_write_iter include/linux/fs.h:1770 [inline] new_sync_write fs/read_write.c:468 [inline] __vfs_write+0x68a/0x970 fs/read_write.c:481 vfs_write+0x18f/0x510 fs/read_write.c:543 SYSC_write fs/read_write.c:588 [inline] SyS_write+0xef/0x220 fs/read_write.c:580 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x40c341 RSP: 002b:00007f469523ec10 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 000000000040c341 RDX: 0000000000000037 RSI: 0000000020004000 RDI: 0000000000000015 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000f4240 R11: 0000000000000293 R12: 00000000004b7fd1 R13: 00000000ffffffff R14: 0000000020000000 R15: 0000000000025000 Allocated by task 3295: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 __do_kmalloc mm/slab.c:3725 [inline] __kmalloc+0x162/0x760 mm/slab.c:3734 kmalloc include/linux/slab.h:498 [inline] tcp_v4_save_options include/net/tcp.h:1962 [inline] tcp_v4_init_req+0x2d3/0x3e0 net/ipv4/tcp_ipv4.c:1271 tcp_conn_request+0xf6d/0x3410 net/ipv4/tcp_input.c:6283 tcp_v4_conn_request+0x157/0x210 net/ipv4/tcp_ipv4.c:1313 tcp_rcv_state_process+0x8ea/0x4850 net/ipv4/tcp_input.c:5857 tcp_v4_do_rcv+0x55c/0x7d0 net/ipv4/tcp_ipv4.c:1482 tcp_v4_rcv+0x2d10/0x2f80 net/ipv4/tcp_ipv4.c:1711 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:249 [inline] ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:464 [inline] ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:249 [inline] ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 netif_receive_skb+0xae/0x390 net/core/dev.c:4611 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 call_write_iter include/linux/fs.h:1770 [inline] new_sync_write fs/read_write.c:468 [inline] __vfs_write+0x68a/0x970 fs/read_write.c:481 vfs_write+0x18f/0x510 fs/read_write.c:543 SYSC_write fs/read_write.c:588 [inline] SyS_write+0xef/0x220 fs/read_write.c:580 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 3306: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kfree+0xca/0x250 mm/slab.c:3820 inet_sock_destruct+0x59d/0x950 net/ipv4/af_inet.c:157 __sk_destruct+0xfd/0x910 net/core/sock.c:1560 sk_destruct+0x47/0x80 net/core/sock.c:1595 __sk_free+0x57/0x230 net/core/sock.c:1603 sk_free+0x2a/0x40 net/core/sock.c:1614 sock_put include/net/sock.h:1652 [inline] inet_csk_complete_hashdance+0xd5/0xf0 net/ipv4/inet_connection_sock.c:959 tcp_check_req+0xf4d/0x1620 net/ipv4/tcp_minisocks.c:765 tcp_v4_rcv+0x17f6/0x2f80 net/ipv4/tcp_ipv4.c:1675 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:249 [inline] ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:464 [inline] ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:249 [inline] ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 netif_receive_skb+0xae/0x390 net/core/dev.c:4611 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 call_write_iter include/linux/fs.h:1770 [inline] new_sync_write fs/read_write.c:468 [inline] __vfs_write+0x68a/0x970 fs/read_write.c:481 vfs_write+0x18f/0x510 fs/read_write.c:543 SYSC_write fs/read_write.c:588 [inline] SyS_write+0xef/0x220 fs/read_write.c:580 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 13 ++++++++----- net/ipv4/cipso_ipv4.c | 24 +++++++----------------- net/ipv4/inet_connection_sock.c | 8 +++----- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 22 +++++++++++++--------- 6 files changed, 33 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 001c08696334..0490916864f9 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -414,8 +414,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; - newinet->inet_opt = ireq->opt; - ireq->opt = NULL; + RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->inet_id = jiffies; @@ -430,7 +429,10 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - + if (*own_req) + ireq->ireq_opt = NULL; + else + newinet->inet_opt = NULL; return newsk; exit_overflow: @@ -441,6 +443,7 @@ exit: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: + newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); dccp_done(newsk); goto exit; @@ -492,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req ireq->ir_rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq->opt); + rcu_dereference(ireq->ireq_opt)); err = net_xmit_eval(err); } @@ -548,7 +551,7 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(inet_rsk(req)->opt); + kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } void dccp_syn_ack_timeout(const struct request_sock *req) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2ae8f54cb321..82178cc69c96 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1951,7 +1951,7 @@ int cipso_v4_req_setattr(struct request_sock *req, buf = NULL; req_inet = inet_rsk(req); - opt = xchg(&req_inet->opt, opt); + opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); if (opt) kfree_rcu(opt, rcu); @@ -1973,11 +1973,13 @@ req_setattr_failure: * values on failure. * */ -static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) +static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { + struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; - struct ip_options_rcu *opt = *opt_ptr; + if (!opt || opt->opt.cipso == 0) + return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; @@ -2039,14 +2041,10 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) */ void cipso_v4_sock_delattr(struct sock *sk) { - int hdr_delta; - struct ip_options_rcu *opt; struct inet_sock *sk_inet; + int hdr_delta; sk_inet = inet_sk(sk); - opt = rcu_dereference_protected(sk_inet->inet_opt, 1); - if (!opt || opt->opt.cipso == 0) - return; hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (sk_inet->is_icsk && hdr_delta > 0) { @@ -2066,15 +2064,7 @@ void cipso_v4_sock_delattr(struct sock *sk) */ void cipso_v4_req_delattr(struct request_sock *req) { - struct ip_options_rcu *opt; - struct inet_request_sock *req_inet; - - req_inet = inet_rsk(req); - opt = req_inet->opt; - if (!opt || opt->opt.cipso == 0) - return; - - cipso_v4_delopt(&req_inet->opt); + cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 67aec7a10686..5ec9136a7c36 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -540,9 +540,10 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct rtable *rt; + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -576,10 +577,9 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct flowi4 *fl4; struct rtable *rt; + opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; - rcu_read_lock(); - opt = rcu_dereference(newinet->inet_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -592,13 +592,11 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; - rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: - rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b1bb1b3a1082..77cf32a80952 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - ireq->opt = tcp_v4_save_options(sock_net(sk), skb); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5d7656beeee..7eec3383702b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6196,7 +6196,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct inet_request_sock *ireq = inet_rsk(req); kmemcheck_annotate_bitfield(ireq, flags); - ireq->opt = NULL; + ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; #endif diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 85164d4d3e53..4c43365c374c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -877,7 +877,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq->opt); + rcu_dereference(ireq->ireq_opt)); err = net_xmit_eval(err); } @@ -889,7 +889,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - kfree(inet_rsk(req)->opt); + kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } #ifdef CONFIG_TCP_MD5SIG @@ -1265,10 +1265,11 @@ static void tcp_v4_init_req(struct request_sock *req, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = sock_net(sk_listener); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, @@ -1355,10 +1356,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; - inet_opt = ireq->opt; - rcu_assign_pointer(newinet->inet_opt, inet_opt); - ireq->opt = NULL; + newinet->inet_saddr = ireq->ir_loc_addr; + inet_opt = rcu_dereference(ireq->ireq_opt); + RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; @@ -1403,9 +1403,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - if (*own_req) + if (likely(*own_req)) { tcp_move_syn(newtp, req); - + ireq->ireq_opt = NULL; + } else { + newinet->inet_opt = NULL; + } return newsk; exit_overflow: @@ -1416,6 +1419,7 @@ exit: tcp_listendrop(sk); return NULL; put_and_exit: + newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto exit; -- cgit From 509c7a1ecc8601f94ffba8a00889fefb239c00c6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Oct 2017 16:14:52 -0700 Subject: packet: avoid panic in packet_getsockopt() syzkaller got crashes in packet_getsockopt() processing PACKET_ROLLOVER_STATS command while another thread was managing to change po->rollover Using RCU will fix this bug. We might later add proper RCU annotations for sparse sake. In v2: I replaced kfree(rollover) in fanout_add() to kfree_rcu() variant, as spotted by John. Fixes: a9b6391814d5 ("packet: rollover statistics") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: John Sperbeck Signed-off-by: David S. Miller --- net/packet/af_packet.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index bec01a3daf5b..2986941164b1 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1769,7 +1769,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) out: if (err && rollover) { - kfree(rollover); + kfree_rcu(rollover, rcu); po->rollover = NULL; } mutex_unlock(&fanout_mutex); @@ -1796,8 +1796,10 @@ static struct packet_fanout *fanout_release(struct sock *sk) else f = NULL; - if (po->rollover) + if (po->rollover) { kfree_rcu(po->rollover, rcu); + po->rollover = NULL; + } } mutex_unlock(&fanout_mutex); @@ -3851,6 +3853,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + struct packet_rollover *rollover; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3929,13 +3932,18 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, 0); break; case PACKET_ROLLOVER_STATS: - if (!po->rollover) + rcu_read_lock(); + rollover = rcu_dereference(po->rollover); + if (rollover) { + rstats.tp_all = atomic_long_read(&rollover->num); + rstats.tp_huge = atomic_long_read(&rollover->num_huge); + rstats.tp_failed = atomic_long_read(&rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); + } + rcu_read_unlock(); + if (!rollover) return -EINVAL; - rstats.tp_all = atomic_long_read(&po->rollover->num); - rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); - rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); - data = &rstats; - lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; -- cgit From 6850d0f8b2542112629061808ed950b35eb982e4 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:05 +1100 Subject: net/ncsi: Fix AEN HNCDSC packet length Correct the value of the HNCDSC AEN packet. Fixes: 7a82ecf4cfb85 "net/ncsi: NCSI AEN packet handler" Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 6898e7229285..f135938bf781 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -187,7 +187,7 @@ static struct ncsi_aen_handler { } ncsi_aen_handlers[] = { { NCSI_PKT_AEN_LSC, 12, ncsi_aen_handler_lsc }, { NCSI_PKT_AEN_CR, 4, ncsi_aen_handler_cr }, - { NCSI_PKT_AEN_HNCDSC, 4, ncsi_aen_handler_hncdsc } + { NCSI_PKT_AEN_HNCDSC, 8, ncsi_aen_handler_hncdsc } }; int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) -- cgit From 0795fb2021f07969949f523ea33c39785bfae9d6 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:06 +1100 Subject: net/ncsi: Stop monitor if channel times out or is inactive ncsi_channel_monitor() misses stopping the channel monitor in several places that it should, causing a WARN_ON_ONCE() to trigger when the monitor is re-started later, eg: [ 459.040000] WARNING: CPU: 0 PID: 1093 at net/ncsi/ncsi-manage.c:269 ncsi_start_channel_monitor+0x7c/0x90 [ 459.040000] CPU: 0 PID: 1093 Comm: kworker/0:3 Not tainted 4.10.17-gaca2fdd #140 [ 459.040000] Hardware name: ASpeed SoC [ 459.040000] Workqueue: events ncsi_dev_work [ 459.040000] [<80010094>] (unwind_backtrace) from [<8000d950>] (show_stack+0x20/0x24) [ 459.040000] [<8000d950>] (show_stack) from [<801dbf70>] (dump_stack+0x20/0x28) [ 459.040000] [<801dbf70>] (dump_stack) from [<80018d7c>] (__warn+0xe0/0x108) [ 459.040000] [<80018d7c>] (__warn) from [<80018e70>] (warn_slowpath_null+0x30/0x38) [ 459.040000] [<80018e70>] (warn_slowpath_null) from [<803f6a08>] (ncsi_start_channel_monitor+0x7c/0x90) [ 459.040000] [<803f6a08>] (ncsi_start_channel_monitor) from [<803f7664>] (ncsi_configure_channel+0xdc/0x5fc) [ 459.040000] [<803f7664>] (ncsi_configure_channel) from [<803f8160>] (ncsi_dev_work+0xac/0x474) [ 459.040000] [<803f8160>] (ncsi_dev_work) from [<8002d244>] (process_one_work+0x1e0/0x450) [ 459.040000] [<8002d244>] (process_one_work) from [<8002d510>] (worker_thread+0x5c/0x570) [ 459.040000] [<8002d510>] (worker_thread) from [<80033614>] (kthread+0x124/0x164) [ 459.040000] [<80033614>] (kthread) from [<8000a5e8>] (ret_from_fork+0x14/0x2c) This also updates the monitor instead of just returning if ncsi_xmit_cmd() fails to send the get-link-status command so that the monitor properly times out. Fixes: e6f44ed6d04d3 "net/ncsi: Package and channel management" Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index b6a449aa9d4b..b022deb39d31 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -202,11 +202,15 @@ static void ncsi_channel_monitor(unsigned long data) monitor_state = nc->monitor.state; spin_unlock_irqrestore(&nc->lock, flags); - if (!enabled || chained) + if (!enabled || chained) { + ncsi_stop_channel_monitor(nc); return; + } if (state != NCSI_CHANNEL_INACTIVE && - state != NCSI_CHANNEL_ACTIVE) + state != NCSI_CHANNEL_ACTIVE) { + ncsi_stop_channel_monitor(nc); return; + } switch (monitor_state) { case NCSI_CHANNEL_MONITOR_START: @@ -217,12 +221,9 @@ static void ncsi_channel_monitor(unsigned long data) nca.type = NCSI_PKT_CMD_GLS; nca.req_flags = 0; ret = ncsi_xmit_cmd(&nca); - if (ret) { + if (ret) netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", ret); - return; - } - break; case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX: break; @@ -233,6 +234,8 @@ static void ncsi_channel_monitor(unsigned long data) ndp->flags |= NCSI_DEV_RESHUFFLE; } + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INVISIBLE; spin_unlock_irqrestore(&nc->lock, flags); -- cgit From 100ef01f3ea4badbee6479290a41f74abd0e523f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 19 Oct 2017 13:43:07 +1100 Subject: net/ncsi: Disable HWA mode when no channels are found When there are no NCSI channels probed, HWA (Hardware Arbitration) mode is enabled. It's not correct because HWA depends on the fact: NCSI channels exist and all of them support HWA mode. This disables HWA when no channels are probed. Signed-off-by: Gavin Shan Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index b022deb39d31..0966eff48ce7 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1005,12 +1005,15 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) struct ncsi_package *np; struct ncsi_channel *nc; unsigned int cap; + bool has_channel = false; /* The hardware arbitration is disabled if any one channel * doesn't support explicitly. */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + has_channel = true; + cap = nc->caps[NCSI_CAP_GENERIC].cap; if (!(cap & NCSI_CAP_GENERIC_HWA) || (cap & NCSI_CAP_GENERIC_HWA_MASK) != @@ -1021,8 +1024,13 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) } } - ndp->flags |= NCSI_DEV_HWA; - return true; + if (has_channel) { + ndp->flags |= NCSI_DEV_HWA; + return true; + } + + ndp->flags &= ~NCSI_DEV_HWA; + return false; } static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) -- cgit From 52b4c8627f9f0d882e969967a207a27a80c9c753 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 19 Oct 2017 13:43:08 +1100 Subject: net/ncsi: Enforce failover on link monitor timeout The NCSI channel has been configured to provide service if its link monitor timer is enabled, regardless of its state (inactive or active). So the timeout event on the link monitor indicates the out-of-service on that channel, for which a failover is needed. This sets NCSI_DEV_RESHUFFLE flag to enforce failover on link monitor timeout, regardless the channel's original state (inactive or active). Also, the link is put into "down" state to give the failing channel lowest priority when selecting for the active channel. The state of failing channel should be set to active in order for deinitialization and failover to be done. Signed-off-by: Gavin Shan Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 0966eff48ce7..28c42b22b748 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -189,6 +189,7 @@ static void ncsi_channel_monitor(unsigned long data) struct ncsi_channel *nc = (struct ncsi_channel *)data; struct ncsi_package *np = nc->package; struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_channel_mode *ncm; struct ncsi_cmd_arg nca; bool enabled, chained; unsigned int monitor_state; @@ -228,20 +229,21 @@ static void ncsi_channel_monitor(unsigned long data) case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX: break; default: - if (!(ndp->flags & NCSI_DEV_HWA) && - state == NCSI_CHANNEL_ACTIVE) { + if (!(ndp->flags & NCSI_DEV_HWA)) { ncsi_report_link(ndp, true); ndp->flags |= NCSI_DEV_RESHUFFLE; } ncsi_stop_channel_monitor(nc); + ncm = &nc->modes[NCSI_MODE_LINK]; spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INVISIBLE; + ncm->data[2] &= ~0x1; spin_unlock_irqrestore(&nc->lock, flags); spin_lock_irqsave(&ndp->lock, flags); - nc->state = NCSI_CHANNEL_INACTIVE; + nc->state = NCSI_CHANNEL_ACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); ncsi_process_next_channel(ndp); -- cgit From 0a90e251988ceedc528c8db98f25b051cf190f44 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 19 Oct 2017 13:43:09 +1100 Subject: net/ncsi: Fix length of GVI response packet The length of GVI (GetVersionInfo) response packet should be 40 instead of 36. This issue was found from /sys/kernel/debug/ncsi/eth0/stats. # ethtool --ncsi eth0 swstats : RESPONSE OK TIMEOUT ERROR ======================================= GVI 0 0 2 With this applied, no error reported on GVI response packets: # ethtool --ncsi eth0 swstats : RESPONSE OK TIMEOUT ERROR ======================================= GVI 2 0 0 Signed-off-by: Gavin Shan Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-rsp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 265b9a892d41..927dad4759d1 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -959,7 +959,7 @@ static struct ncsi_rsp_handler { { NCSI_PKT_RSP_EGMF, 4, ncsi_rsp_handler_egmf }, { NCSI_PKT_RSP_DGMF, 4, ncsi_rsp_handler_dgmf }, { NCSI_PKT_RSP_SNFC, 4, ncsi_rsp_handler_snfc }, - { NCSI_PKT_RSP_GVI, 36, ncsi_rsp_handler_gvi }, + { NCSI_PKT_RSP_GVI, 40, ncsi_rsp_handler_gvi }, { NCSI_PKT_RSP_GC, 32, ncsi_rsp_handler_gc }, { NCSI_PKT_RSP_GP, -1, ncsi_rsp_handler_gp }, { NCSI_PKT_RSP_GCPS, 172, ncsi_rsp_handler_gcps }, -- cgit From b4562ca7925a3bedada87a3dd072dd5bad043288 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 19 Oct 2017 03:33:14 +0000 Subject: hv_sock: add locking in the open/close/release code paths Without the patch, when hvs_open_connection() hasn't completely established a connection (e.g. it has changed sk->sk_state to SS_CONNECTED, but hasn't inserted the sock into the connected queue), vsock_stream_connect() may see the sk_state change and return the connection to the userspace, and next when the userspace closes the connection quickly, hvs_release() may not see the connection in the connected queue; finally hvs_open_connection() inserts the connection into the queue, but we won't be able to purge the connection for ever. Signed-off-by: Dexuan Cui Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Cathy Avery Cc: Rolf Neugebauer Cc: Marcelo Cerri Signed-off-by: David S. Miller --- net/vmw_vsock/hyperv_transport.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 14ed5a344cdf..e21991fe883a 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -310,11 +310,15 @@ static void hvs_close_connection(struct vmbus_channel *chan) struct sock *sk = get_per_channel_state(chan); struct vsock_sock *vsk = vsock_sk(sk); + lock_sock(sk); + sk->sk_state = SS_UNCONNECTED; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; sk->sk_state_change(sk); + + release_sock(sk); } static void hvs_open_connection(struct vmbus_channel *chan) @@ -344,6 +348,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!sk) return; + lock_sock(sk); + if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || (!conn_from_host && sk->sk_state != SS_CONNECTING)) goto out; @@ -395,9 +401,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vsock_insert_connected(vnew); - lock_sock(sk); vsock_enqueue_accept(sk, new); - release_sock(sk); } else { sk->sk_state = SS_CONNECTED; sk->sk_socket->state = SS_CONNECTED; @@ -410,6 +414,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) out: /* Release refcnt obtained when we called vsock_find_bound_socket() */ sock_put(sk); + + release_sock(sk); } static u32 hvs_get_local_cid(void) @@ -476,13 +482,21 @@ out: static void hvs_release(struct vsock_sock *vsk) { + struct sock *sk = sk_vsock(vsk); struct hvsock *hvs = vsk->trans; - struct vmbus_channel *chan = hvs->chan; + struct vmbus_channel *chan; + lock_sock(sk); + + sk->sk_state = SS_DISCONNECTING; + vsock_remove_sock(vsk); + + release_sock(sk); + + chan = hvs->chan; if (chan) hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN); - vsock_remove_sock(vsk); } static void hvs_destruct(struct vsock_sock *vsk) -- cgit From 197df02cb3d3e969fb1d6fc11f5a634b7bfc2124 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 19 Oct 2017 14:22:17 +0200 Subject: udp: make some messages more descriptive In the UDP code there are two leftover error messages with very few meaning. Replace them with a more descriptive error message as some users reported them as "strange network error". Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e45177ceb0ee..806b298a3bdd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1061,7 +1061,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - net_dbg_ratelimited("cork app bug 2\n"); + net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } @@ -1144,7 +1144,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - net_dbg_ratelimited("udp cork app bug 3\n"); + net_dbg_ratelimited("cork failed\n"); return -EINVAL; } -- cgit From 54d431176429e9cf064461589e5174349a9f73da Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 19 Oct 2017 12:40:39 -0400 Subject: sock: correct sk_wmem_queued accounting on efault in tcp zerocopy Syzkaller hits WARN_ON(sk->sk_wmem_queued) in sk_stream_kill_queues after triggering an EFAULT in __zerocopy_sg_from_iter. On this error, skb_zerocopy_stream_iter resets the skb to its state before the operation with __pskb_trim. It cannot kfree_skb like datagram callers, as the skb may have data from a previous send call. __pskb_trim calls skb_condense for unowned skbs, which adjusts their truesize. These tcp skbuffs are owned and their truesize must add up to sk_wmem_queued. But they match because their skb->sk is NULL until tcp_transmit_skb. Temporarily set skb->sk when calling __pskb_trim to signal that the skbuffs are owned and avoid the skb_condense path. Fixes: 52267790ef52 ("sock: add MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e62476beee95..24656076906d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1124,9 +1124,13 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + struct sock *save_sk = skb->sk; + /* Streams do not free skb on error. Reset to prev state. */ msg->msg_iter = orig_iter; + skb->sk = sk; ___pskb_trim(skb, orig_len); + skb->sk = save_sk; return err; } -- cgit From 66c54517540cedf5a22911c6b7f5c7d8b5d1e1be Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 19 Oct 2017 20:17:32 +0300 Subject: net: bridge: fix returning of vlan range op errors When vlan tunnels were introduced, vlan range errors got silently dropped and instead 0 was returned always. Restore the previous behaviour and return errors to user-space. Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3bc890716c89..de2152730809 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -573,7 +573,7 @@ static int br_process_vlan_info(struct net_bridge *br, } *vinfo_last = NULL; - return 0; + return err; } return br_vlan_info(br, p, cmd, vinfo_curr); -- cgit From 1b5f962e71bfad6284574655c406597535c3ea7a Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 19 Oct 2017 15:00:29 -0400 Subject: soreuseport: fix initialization race Syzkaller stumbled upon a way to trigger WARNING: CPU: 1 PID: 13881 at net/core/sock_reuseport.c:41 reuseport_alloc+0x306/0x3b0 net/core/sock_reuseport.c:39 There are two initialization paths for the sock_reuseport structure in a socket: Through the udp/tcp bind paths of SO_REUSEPORT sockets or through SO_ATTACH_REUSEPORT_[CE]BPF before bind. The existing implementation assumedthat the socket lock protected both of these paths when it actually only protects the SO_ATTACH_REUSEPORT path. Syzkaller triggered this double allocation by running these paths concurrently. This patch moves the check for double allocation into the reuseport_alloc function which is protected by a global spin lock. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- net/core/sock_reuseport.c | 12 +++++++++--- net/ipv4/inet_hashtables.c | 5 +---- net/ipv4/udp.c | 5 +---- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index eed1ebf7f29d..b1e0dbea1e8c 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -36,9 +36,14 @@ int reuseport_alloc(struct sock *sk) * soft irq of receive path or setsockopt from process context */ spin_lock_bh(&reuseport_lock); - WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - "multiple allocations for the same socket"); + + /* Allocation attempts can occur concurrently via the setsockopt path + * and the bind/hash path. Nothing to do when we lose the race. + */ + if (rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock))) + goto out; + reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -49,6 +54,7 @@ int reuseport_alloc(struct sock *sk) reuse->num_socks = 1; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); +out: spin_unlock_bh(&reuseport_lock); return 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 597bb4cfe805..e7d15fb0d94d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -456,10 +456,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_add_sock(sk, sk2); } - /* Initial allocation may have already happened via setsockopt */ - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return reuseport_alloc(sk); - return 0; + return reuseport_alloc(sk); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 806b298a3bdd..ebfbccae62fd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -231,10 +231,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) } } - /* Initial allocation may have already happened via setsockopt */ - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return reuseport_alloc(sk); - return 0; + return reuseport_alloc(sk); } /** -- cgit From 95491e3cf37840c518d81e1a3a6a8ef554e03c54 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Fri, 20 Oct 2017 01:32:08 +0200 Subject: net: ethtool: remove error check for legacy setting transceiver type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9cab88726929605 ("net: ethtool: Add back transceiver type") restores the transceiver type to struct ethtool_link_settings and convert_link_ksettings_to_legacy_settings() but forgets to remove the error check for the same in convert_legacy_settings_to_link_ksettings(). This prevents older versions of ethtool to change link settings. # ethtool --version ethtool version 3.16 # ethtool -s eth0 autoneg on speed 100 duplex full Cannot set new settings: Invalid argument not setting speed not setting duplex not setting autoneg While newer versions of ethtool works. # ethtool --version ethtool version 4.10 # ethtool -s eth0 autoneg on speed 100 duplex full [ 57.703268] sh-eth ee700000.ethernet eth0: Link is Down [ 59.618227] sh-eth ee700000.ethernet eth0: Link is Up - 100Mbps/Full - flow control rx/tx Fixes: 19cab88726929605 ("net: ethtool: Add back transceiver type") Signed-off-by: Niklas Söderlund Reported-by: Renjith R V Tested-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- net/core/ethtool.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3228411ada0f..9a9a3d77e327 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -436,7 +436,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); /* return false if legacy contained non-0 deprecated fields - * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated + * maxtxpkt/maxrxpkt. rest of ksettings always updated */ static bool convert_legacy_settings_to_link_ksettings( @@ -451,8 +451,7 @@ convert_legacy_settings_to_link_ksettings( * deprecated legacy fields, and they should not use * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS */ - if (legacy_settings->transceiver || - legacy_settings->maxtxpkt || + if (legacy_settings->maxtxpkt || legacy_settings->maxrxpkt) retval = false; -- cgit From 6cb3ece9685f78f9b288dd2afea58c35784e40b8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Oct 2017 17:01:22 +0100 Subject: rxrpc: Don't release call mutex on error pointer Don't release call mutex at the end of rxrpc_kernel_begin_call() if the call pointer actually holds an error value. Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") Reported-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fb17552fd292..4b0a8288c98a 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -308,10 +308,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len, gfp); /* The socket has been unlocked. */ - if (!IS_ERR(call)) + if (!IS_ERR(call)) { call->notify_rx = notify_rx; + mutex_unlock(&call->user_mutex); + } - mutex_unlock(&call->user_mutex); _leave(" = %p", call); return call; } -- cgit From 864e2a1f8aac05effac6063ce316b480facb46ff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 21 Oct 2017 12:26:23 -0700 Subject: ipv6: flowlabel: do not leave opt->tot_len with garbage When syzkaller team brought us a C repro for the crash [1] that had been reported many times in the past, I finally could find the root cause. If FlowLabel info is merged by fl6_merge_options(), we leave part of the opt_space storage provided by udp/raw/l2tp with random value in opt_space.tot_len, unless a control message was provided at sendmsg() time. Then ip6_setup_cork() would use this random value to perform a kzalloc() call. Undefined behavior and crashes. Fix is to properly set tot_len in fl6_merge_options() At the same time, we can also avoid consuming memory and cpu cycles to clear it, if every option is copied via a kmemdup(). This is the change in ip6_setup_cork(). [1] kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 6613 Comm: syz-executor0 Not tainted 4.14.0-rc4+ #127 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801cb64a100 task.stack: ffff8801cc350000 RIP: 0010:ip6_setup_cork+0x274/0x15c0 net/ipv6/ip6_output.c:1168 RSP: 0018:ffff8801cc357550 EFLAGS: 00010203 RAX: dffffc0000000000 RBX: ffff8801cc357748 RCX: 0000000000000010 RDX: 0000000000000002 RSI: ffffffff842bd1d9 RDI: 0000000000000014 RBP: ffff8801cc357620 R08: ffff8801cb17f380 R09: ffff8801cc357b10 R10: ffff8801cb64a100 R11: 0000000000000000 R12: ffff8801cc357ab0 R13: ffff8801cc357b10 R14: 0000000000000000 R15: ffff8801c3bbf0c0 FS: 00007f9c5c459700(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020324000 CR3: 00000001d1cf2000 CR4: 00000000001406f0 DR0: 0000000020001010 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Call Trace: ip6_make_skb+0x282/0x530 net/ipv6/ip6_output.c:1729 udpv6_sendmsg+0x2769/0x3380 net/ipv6/udp.c:1340 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x358/0x5a0 net/socket.c:1750 SyS_sendto+0x40/0x50 net/socket.c:1718 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4520a9 RSP: 002b:00007f9c5c458c08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004520a9 RDX: 0000000000000001 RSI: 0000000020fd1000 RDI: 0000000000000016 RBP: 0000000000000086 R08: 0000000020e0afe4 R09: 000000000000001c R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004bb1ee R13: 00000000ffffffff R14: 0000000000000016 R15: 0000000000000029 Code: e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 ea 0f 00 00 48 8d 79 04 48 b8 00 00 00 00 00 fc ff df 45 8b 74 24 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 RIP: ip6_setup_cork+0x274/0x15c0 net/ipv6/ip6_output.c:1168 RSP: ffff8801cc357550 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 1 + net/ipv6/ip6_output.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 8081bafe441b..15535ee327c5 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -315,6 +315,7 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; + opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 43ca864327c7..5110a418cc4d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1161,11 +1161,11 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (WARN_ON(v6_cork->opt)) return -EINVAL; - v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation); + v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); if (unlikely(!v6_cork->opt)) return -ENOBUFS; - v6_cork->opt->tot_len = opt->tot_len; + v6_cork->opt->tot_len = sizeof(*opt); v6_cork->opt->opt_flen = opt->opt_flen; v6_cork->opt->opt_nflen = opt->opt_nflen; -- cgit From 3a91d29f20276fa7cd4d0c9c7f3e78b30708159d Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sun, 22 Oct 2017 13:13:16 +0900 Subject: tcp: do tcp_mstamp_refresh before retransmits on TSQ handler When retransmission on TSQ handler was introduced in the commit f9616c35a0d7 ("tcp: implement TSQ for retransmits"), the retransmitted skbs' timestamps were updated on the actual transmission. In the later commit 385e20706fac ("tcp: use tp->tcp_mstamp in output path"), it stops being done so. In the commit, the comment says "We try to refresh tp->tcp_mstamp only when necessary", and at present tcp_tsq_handler and tcp_v4_mtu_reduced applies to this. About the latter, it's okay since it's rare enough. About the former, even though possible retransmissions on the tasklet comes just after the destructor run in NET_RX softirq handling, the time between them could be nonnegligibly large to the extent that tcp_rack_advance or rto rearming be affected if other (remaining) RX, BLOCK and (preceding) TASKLET sofirq handlings are unexpectedly heavy. So in the same way as tcp_write_timer_handler does, doing tcp_mstamp_refresh ensures the accuracy of algorithms relying on it. Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") Signed-off-by: Koichiro Den Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..973befc36fd4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -739,8 +739,10 @@ static void tcp_tsq_handler(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && - tp->snd_cwnd > tcp_packets_in_flight(tp)) + tp->snd_cwnd > tcp_packets_in_flight(tp)) { + tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); + } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); -- cgit From a6ca7abe53633d08eea1c6756cb49c9b2d4c90bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Oct 2017 12:33:57 -0700 Subject: tcp/dccp: fix lockdep splat in inet_csk_route_req() This patch fixes the following lockdep splat in inet_csk_route_req() lockdep_rcu_suspicious inet_csk_route_req tcp_v4_send_synack tcp_rtx_synack inet_rtx_syn_ack tcp_fastopen_synack_time tcp_retransmit_timer tcp_write_timer_handler tcp_write_timer call_timer_fn Thread running inet_csk_route_req() owns a reference on the request socket, so we have the guarantee ireq->ireq_opt wont be changed or freed. lockdep can enforce this invariant for us. Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5ec9136a7c36..18cd2eae758f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -543,7 +543,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = rcu_dereference(ireq->ireq_opt); + opt = rcu_dereference_protected(ireq->ireq_opt, + refcount_read(&req->rsk_refcnt) > 0); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), -- cgit From 1137b5e2529a8f5ca8ee709288ecba3e68044df2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 Oct 2017 20:51:10 +0800 Subject: ipsec: Fix aborted xfrm policy dump crash An independent security researcher, Mohamed Ghannam, has reported this vulnerability to Beyond Security's SecuriTeam Secure Disclosure program. The xfrm_dump_policy_done function expects xfrm_dump_policy to have been called at least once or it will crash. This can be triggered if a dump fails because the target socket's receive buffer is full. This patch fixes it by using the cb->start mechanism to ensure that the initialisation is always done regardless of the buffer situation. Fixes: 12a169e7d8f4 ("ipsec: Put dumpers on the dump list") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b997f1395357..e44a0fed48dd 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1693,32 +1693,34 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr static int xfrm_dump_policy_done(struct netlink_callback *cb) { - struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1]; + struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args; struct net *net = sock_net(cb->skb->sk); xfrm_policy_walk_done(walk, net); return 0; } +static int xfrm_dump_policy_start(struct netlink_callback *cb) +{ + struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args; + + BUILD_BUG_ON(sizeof(*walk) > sizeof(cb->args)); + + xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY); + return 0; +} + static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1]; + struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args; struct xfrm_dump_info info; - BUILD_BUG_ON(sizeof(struct xfrm_policy_walk) > - sizeof(cb->args) - sizeof(cb->args[0])); - info.in_skb = cb->skb; info.out_skb = skb; info.nlmsg_seq = cb->nlh->nlmsg_seq; info.nlmsg_flags = NLM_F_MULTI; - if (!cb->args[0]) { - cb->args[0] = 1; - xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY); - } - (void) xfrm_policy_walk(net, walk, dump_one_policy, &info); return skb->len; @@ -2474,6 +2476,7 @@ static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); + int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); const struct nla_policy *nla_pol; @@ -2487,6 +2490,7 @@ static const struct xfrm_link { [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy }, [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy, + .start = xfrm_dump_policy_start, .dump = xfrm_dump_policy, .done = xfrm_dump_policy_done }, [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi }, @@ -2539,6 +2543,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, { struct netlink_dump_control c = { + .start = link->start, .dump = link->dump, .done = link->done, }; -- cgit From b71d21c274eff20a9db8158882b545b141b73ab8 Mon Sep 17 00:00:00 2001 From: Laszlo Toth Date: Mon, 23 Oct 2017 19:19:33 +0200 Subject: sctp: full support for ipv6 ip_nonlocal_bind & IP_FREEBIND Commit 9b9742022888 ("sctp: support ipv6 nonlocal bind") introduced support for the above options as v4 sctp did, so patched sctp_v6_available(). In the v4 implementation it's enough, because sctp_inet_bind_verify() just returns with sctp_v4_available(). However sctp_inet6_bind_verify() has an extra check before that for link-local scope_id, which won't respect the above options. Added the checks before calling ipv6_chk_addr(), but not before the validation of scope_id. before (w/ both options): ./v6test fe80::10 sctp bind failed, errno: 99 (Cannot assign requested address) ./v6test fe80::10 tcp bind success, errno: 0 (Success) after (w/ both options): ./v6test fe80::10 sctp bind success, errno: 0 (Success) Signed-off-by: Laszlo Toth Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 51c488769590..7fe9e1d1b7ec 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -882,8 +882,10 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) net = sock_net(&opt->inet.sk); rcu_read_lock(); dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id); - if (!dev || - !ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0)) { + if (!dev || !(opt->inet.freebind || + net->ipv6.sysctl.ip_nonlocal_bind || + ipv6_chk_addr(net, &addr->v6.sin6_addr, + dev, 0))) { rcu_read_unlock(); return 0; } -- cgit From 829385f08ae99740276cbd46c9db29764c519211 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 20 Oct 2017 16:40:43 -0700 Subject: strparser: Use delayed work instead of timer for msg timeout Sock lock may be taken in the message timer function which is a problem since timers run in BH. Instead of timers use delayed_work. Reported-by: Eric Dumazet Fixes: bbb03029a899 ("strparser: Generalize strparser") Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/strparser/strparser.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index d4ea46a5f233..c5fda15ba319 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -49,7 +49,7 @@ static void strp_abort_strp(struct strparser *strp, int err) { /* Unrecoverable error in receive */ - del_timer(&strp->msg_timer); + cancel_delayed_work(&strp->msg_timer_work); if (strp->stopped) return; @@ -68,7 +68,7 @@ static void strp_abort_strp(struct strparser *strp, int err) static void strp_start_timer(struct strparser *strp, long timeo) { if (timeo) - mod_timer(&strp->msg_timer, timeo); + mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo); } /* Lower lock held */ @@ -319,7 +319,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, eaten += (cand_len - extra); /* Hurray, we have a new message! */ - del_timer(&strp->msg_timer); + cancel_delayed_work(&strp->msg_timer_work); strp->skb_head = NULL; STRP_STATS_INCR(strp->stats.msgs); @@ -450,9 +450,10 @@ static void strp_work(struct work_struct *w) do_strp_work(container_of(w, struct strparser, work)); } -static void strp_msg_timeout(unsigned long arg) +static void strp_msg_timeout(struct work_struct *w) { - struct strparser *strp = (struct strparser *)arg; + struct strparser *strp = container_of(w, struct strparser, + msg_timer_work.work); /* Message assembly timed out */ STRP_STATS_INCR(strp->stats.msg_timeouts); @@ -505,9 +506,7 @@ int strp_init(struct strparser *strp, struct sock *sk, strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp; - setup_timer(&strp->msg_timer, strp_msg_timeout, - (unsigned long)strp); - + INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout); INIT_WORK(&strp->work, strp_work); return 0; @@ -532,7 +531,7 @@ void strp_done(struct strparser *strp) { WARN_ON(!strp->stopped); - del_timer_sync(&strp->msg_timer); + cancel_delayed_work_sync(&strp->msg_timer_work); cancel_work_sync(&strp->work); if (strp->skb_head) { -- cgit From 3eb8feeb1708c7dbfd2e97df92a2a407c116606e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 24 Oct 2017 16:37:19 -0400 Subject: net: dsa: check master device before put In the case of pdata, the dsa_cpu_parse function calls dev_put() before making sure it isn't NULL. Fix this. Fixes: 71e0bbde0d88 ("net: dsa: Add support for platform data") Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 873af0108e24..045d8a176279 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -496,14 +496,15 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, if (!ethernet) return -EINVAL; ethernet_dev = of_find_net_device_by_node(ethernet); + if (!ethernet_dev) + return -EPROBE_DEFER; } else { ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]); + if (!ethernet_dev) + return -EPROBE_DEFER; dev_put(ethernet_dev); } - if (!ethernet_dev) - return -EPROBE_DEFER; - if (!dst->cpu_dp) { dst->cpu_dp = port; dst->cpu_dp->netdev = ethernet_dev; -- cgit From cfbb0d90a7abb289edc91833d0905931f8805f12 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 24 Oct 2017 21:12:13 +0200 Subject: mac80211: don't compare TKIP TX MIC key in reinstall prevention For the reinstall prevention, the code I had added compares the whole key. It turns out though that iwlwifi firmware doesn't provide the TKIP TX MIC key as it's not needed in client mode, and thus the comparison will always return false. For client mode, thus always zero out the TX MIC key part before doing the comparison in order to avoid accepting the reinstall of the key with identical encryption and RX MIC key, but not the same TX MIC key (since the supplicant provides the real one.) Fixes: fdf7cb4185b6 ("mac80211: accept key reinstall without changing anything") Signed-off-by: Johannes Berg --- net/mac80211/key.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 035d16fe926e..938049395f90 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -610,6 +610,39 @@ void ieee80211_key_free_unused(struct ieee80211_key *key) ieee80211_key_free_common(key); } +static bool ieee80211_key_identical(struct ieee80211_sub_if_data *sdata, + struct ieee80211_key *old, + struct ieee80211_key *new) +{ + u8 tkip_old[WLAN_KEY_LEN_TKIP], tkip_new[WLAN_KEY_LEN_TKIP]; + u8 *tk_old, *tk_new; + + if (!old || new->conf.keylen != old->conf.keylen) + return false; + + tk_old = old->conf.key; + tk_new = new->conf.key; + + /* + * In station mode, don't compare the TX MIC key, as it's never used + * and offloaded rekeying may not care to send it to the host. This + * is the case in iwlwifi, for example. + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + new->conf.cipher == WLAN_CIPHER_SUITE_TKIP && + new->conf.keylen == WLAN_KEY_LEN_TKIP && + !(new->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { + memcpy(tkip_old, tk_old, WLAN_KEY_LEN_TKIP); + memcpy(tkip_new, tk_new, WLAN_KEY_LEN_TKIP); + memset(tkip_old + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8); + memset(tkip_new + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8); + tk_old = tkip_old; + tk_new = tkip_new; + } + + return !crypto_memneq(tk_old, tk_new, new->conf.keylen); +} + int ieee80211_key_link(struct ieee80211_key *key, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) @@ -635,8 +668,7 @@ int ieee80211_key_link(struct ieee80211_key *key, * Silently accept key re-installation without really installing the * new version of the key to avoid nonce reuse or replay issues. */ - if (old_key && key->conf.keylen == old_key->conf.keylen && - !crypto_memneq(key->conf.key, old_key->conf.key, key->conf.keylen)) { + if (ieee80211_key_identical(sdata, old_key, key)) { ieee80211_key_free_unused(key); ret = 0; goto out; -- cgit From 0f5da659d8f1810f44de14acf2c80cd6499623a0 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 25 Oct 2017 10:16:42 -0700 Subject: net/unix: don't show information about sockets from other namespaces socket_diag shows information only about sockets from a namespace where a diag socket lives. But if we request information about one unix socket, the kernel don't check that its netns is matched with a diag socket namespace, so any user can get information about any unix socket in a system. This looks like a bug. v2: add a Fixes tag Fixes: 51d7cccf0723 ("net: make sock diag per-namespace") Signed-off-by: Andrei Vagin Signed-off-by: David S. Miller --- net/unix/diag.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/unix/diag.c b/net/unix/diag.c index 4d9679701a6d..384c84e83462 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -257,6 +257,8 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, err = -ENOENT; if (sk == NULL) goto out_nosk; + if (!net_eq(sock_net(sk), net)) + goto out; err = sock_diag_check_cookie(sk, req->udiag_cookie); if (err) -- cgit From e9a0b99804ff662d02b78a556a84e22308066fe1 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Tue, 24 Oct 2017 18:17:18 +0200 Subject: rds: ib: Fix uninitialized variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit send_flags needs to be initialized before calling rds_ib_set_wr_signal_state(). Signed-off-by: Håkon Bugge Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_send.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 6ab39dbcca01..8f46755477ae 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -792,6 +792,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask; send->s_atomic_wr.swap_mask = 0; } + send->s_wr.send_flags = 0; nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_atomic_wr.wr.num_sge = 1; send->s_atomic_wr.wr.next = NULL; -- cgit From a0c0865fa0abcbc142c11fabec3a2bffc1a4229d Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Tue, 24 Oct 2017 16:16:28 +0200 Subject: rds: Fix inaccurate accounting of unsignaled wrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of unsignaled work-requests posted to the IB send queue is tracked by a counter in the rds_ib_connection struct. When it reaches zero, or the caller explicitly asks for it, the send-signaled bit is set in send_flags and the counter is reset. This is performed by the rds_ib_set_wr_signal_state() function. However, this function is not always used which yields inaccurate accounting. This commit fixes this, re-factors a code bloat related to the matter, and makes the actual parameter type to the function consistent. Signed-off-by: Håkon Bugge Signed-off-by: David S. Miller --- net/rds/ib_send.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 8f46755477ae..8557a1cae041 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -661,13 +661,15 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } } - rds_ib_set_wr_signal_state(ic, send, 0); + rds_ib_set_wr_signal_state(ic, send, false); /* * Always signal the last one if we're stopping due to flow control. */ - if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) { + rds_ib_set_wr_signal_state(ic, send, true); + send->s_wr.send_flags |= IB_SEND_SOLICITED; + } if (send->s_wr.send_flags & IB_SEND_SIGNALED) nr_sig++; @@ -705,11 +707,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; - if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - prev->s_wr.send_flags |= IB_SEND_SIGNALED; - nr_sig++; - } + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) + nr_sig += rds_ib_set_wr_signal_state(ic, prev, true); ic->i_data_op = NULL; } -- cgit From 06f877d613be3621604c2520ec0351d9fbdca15f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Oct 2017 08:20:31 -0700 Subject: tcp/dccp: fix other lockdep splats accessing ireq_opt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In my first attempt to fix the lockdep splat, I forgot we could enter inet_csk_route_req() with a freshly allocated request socket, for which refcount has not yet been elevated, due to complex SLAB_TYPESAFE_BY_RCU rules. We either are in rcu_read_lock() section _or_ we own a refcount on the request. Correct RCU verb to use here is rcu_dereference_check(), although it is not possible to prove we actually own a reference on a shared refcount :/ In v2, I added ireq_opt_deref() helper and use in three places, to fix other possible splats. [ 49.844590] lockdep_rcu_suspicious+0xea/0xf3 [ 49.846487] inet_csk_route_req+0x53/0x14d [ 49.848334] tcp_v4_route_req+0xe/0x10 [ 49.850174] tcp_conn_request+0x31c/0x6a0 [ 49.851992] ? __lock_acquire+0x614/0x822 [ 49.854015] tcp_v4_conn_request+0x5a/0x79 [ 49.855957] ? tcp_v4_conn_request+0x5a/0x79 [ 49.858052] tcp_rcv_state_process+0x98/0xdcc [ 49.859990] ? sk_filter_trim_cap+0x2f6/0x307 [ 49.862085] tcp_v4_do_rcv+0xfc/0x145 [ 49.864055] ? tcp_v4_do_rcv+0xfc/0x145 [ 49.866173] tcp_v4_rcv+0x5ab/0xaf9 [ 49.868029] ip_local_deliver_finish+0x1af/0x2e7 [ 49.870064] ip_local_deliver+0x1b2/0x1c5 [ 49.871775] ? inet_del_offload+0x45/0x45 [ 49.873916] ip_rcv_finish+0x3f7/0x471 [ 49.875476] ip_rcv+0x3f1/0x42f [ 49.876991] ? ip_local_deliver_finish+0x2e7/0x2e7 [ 49.878791] __netif_receive_skb_core+0x6d3/0x950 [ 49.880701] ? process_backlog+0x7e/0x216 [ 49.882589] __netif_receive_skb+0x1d/0x5e [ 49.884122] process_backlog+0x10c/0x216 [ 49.885812] net_rx_action+0x147/0x3df Fixes: a6ca7abe53633 ("tcp/dccp: fix lockdep splat in inet_csk_route_req()") Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races") Signed-off-by: Eric Dumazet Reported-by: kernel test robot Reported-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 2 +- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp_ipv4.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0490916864f9..e65fcb45c3f6 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -495,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req ireq->ir_rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + ireq_opt_deref(ireq)); err = net_xmit_eval(err); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 18cd2eae758f..b47a59cb3573 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -543,8 +543,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = rcu_dereference_protected(ireq->ireq_opt, - refcount_read(&req->rsk_refcnt) > 0); + opt = ireq_opt_deref(ireq); + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4c43365c374c..5b027c69cbc5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -877,7 +877,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + ireq_opt_deref(ireq)); err = net_xmit_eval(err); } -- cgit From 5889e2c0e441d84060e66211ed5c4517ca591167 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Tue, 24 Oct 2017 16:44:42 -0700 Subject: tcp: call tcp_rate_skb_sent() when retransmit with unaligned skb->data Current implementation calls tcp_rate_skb_sent() when tcp_transmit_skb() is called when it clones skb only. Not calling tcp_rate_skb_sent() is OK for all such code paths except from __tcp_retransmit_skb() which happens when skb->data address is not aligned. This may rarely happen e.g. when small amount of data is sent initially and the receiver partially acks odd number of bytes for some reason, possibly malicious. Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 973befc36fd4..1151870018e3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2843,8 +2843,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; - if (!err) + if (!err) { skb->skb_mstamp = tp->tcp_mstamp; + tcp_rate_skb_sent(sk, skb); + } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -- cgit From f3594f0a7ea36661d7fd942facd7f31a64245f1a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 26 Oct 2017 19:19:56 +0800 Subject: ipip: only increase err_count for some certain type icmp in ipip_err t->err_count is used to count the link failure on tunnel and an err will be reported to user socket in tx path if t->err_count is not 0. udp socket could even return EHOSTUNREACH to users. Since commit fd58156e456d ("IPIP: Use ip-tunneling code.") removed the 'switch check' for icmp type in ipip_err(), err_count would be increased by the icmp packet with ICMP_EXC_FRAGTIME code. an link failure would be reported out due to this. In Jianlin's case, when receiving ICMP_EXC_FRAGTIME a icmp packet, udp netperf failed with the err: send_data: data send error: No route to host (errno 113) We expect this error reported from tunnel to socket when receiving some certain type icmp, but not ICMP_EXC_FRAGTIME, ICMP_SR_FAILED or ICMP_PARAMETERPROB ones. This patch is to bring 'switch check' for icmp type back to ipip_err so that it only reports link failure for the right type icmp, just as in ipgre_err() and ipip6_err(). Fixes: fd58156e456d ("IPIP: Use ip-tunneling code.") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 59 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fb1ad22b5e29..cdd627355ed1 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -128,43 +128,68 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { - -/* All the routers (except for Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. - */ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; - struct ip_tunnel *t; - int err; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err = 0; + + switch (type) { + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + /* Impossible event. */ + goto out; + default: + /* All others are translated to HOST_UNREACH. + * rfc2003 contains "deep thoughts" about NET_UNREACH, + * I believe they are just ether pollution. --ANK + */ + break; + } + break; + + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + goto out; + break; + + case ICMP_REDIRECT: + break; + + default: + goto out; + } - err = -ENOENT; t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->daddr, iph->saddr, 0); - if (!t) + if (!t) { + err = -ENOENT; goto out; + } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, iph->protocol, 0); - err = 0; + ipv4_update_pmtu(skb, net, info, t->parms.link, 0, + iph->protocol, 0); goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, - iph->protocol, 0); - err = 0; + ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0); goto out; } - if (t->parms.iph.daddr == 0) + if (t->parms.iph.daddr == 0) { + err = -ENOENT; goto out; + } - err = 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; -- cgit From f8d20b46ce55cf40afb30dcef6d9288f7ef46d9b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 26 Oct 2017 19:23:27 +0800 Subject: ip6_gre: only increase err_count for some certain type icmpv6 in ip6gre_err The similar fix in patch 'ipip: only increase err_count for some certain type icmp in ipip_err' is needed for ip6gre_err. In Jianlin's case, udp netperf broke even when receiving a TooBig icmpv6 packet. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 1602b491b281..fb595e8dc15b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -408,13 +408,16 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); - break; + if (code != ICMPV6_PORT_UNREACH) + break; + return; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); + break; } - break; + return; case ICMPV6_PARAMPROB: teli = 0; if (code == ICMPV6_HDR_FIELD) @@ -430,7 +433,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } - break; + return; case ICMPV6_PKT_TOOBIG: mtu = be32_to_cpu(info) - offset - t->tun_hlen; if (t->dev->type == ARPHRD_ETHER) @@ -438,7 +441,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; - break; + return; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) -- cgit From 8aec4959d832bae0889a8e2f348973b5e4abffef Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 26 Oct 2017 19:27:17 +0800 Subject: ip6_gre: update dst pmtu if dev mtu has been updated by toobig in __gre6_xmit When receiving a Toobig icmpv6 packet, ip6gre_err would just set tunnel dev's mtu, that's not enough. For skb_dst(skb)'s pmtu may still be using the old value, it has no chance to be updated with tunnel dev's mtu. Jianlin found this issue by reducing route's mtu while running netperf, the performance went to 0. ip6ip6 and ip4ip6 tunnel can work well with this, as they lookup the upper dst and update_pmtu it's pmtu or icmpv6_send a Toobig to upper socket after setting tunnel dev's mtu. We couldn't do that for ip6_gre, as gre's inner packet could be any protocol, it's difficult to handle them (like lookup upper dst) in a good way. So this patch is to fix it by updating skb_dst(skb)'s pmtu when dev->mtu < skb_dst(skb)'s pmtu in tx path. It's safe to do this update there, as usually dev->mtu <= skb_dst(skb)'s pmtu and no performance regression can be caused by this. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index fb595e8dc15b..59c121b932ac 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -503,8 +503,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - __be16 protocol = (dev->type == ARPHRD_ETHER) ? - htons(ETH_P_TEB) : proto; + struct dst_entry *dst = skb_dst(skb); + __be16 protocol; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -518,9 +518,14 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, tunnel->o_seqno++; /* Push GRE header. */ + protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + /* TooBig packet may have updated dst->dev's mtu */ + if (dst && dst_mtu(dst) > dst->dev->mtu) + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } -- cgit From ee1836aec4f5a977c1699a311db4d9027ef21ac8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:21:40 -0700 Subject: tcp: refresh tp timestamp before tcp_mtu_probe() In the unlikely event tcp_mtu_probe() is sending a packet, we want tp->tcp_mstamp being as accurate as possible. This means we need to call tcp_mstamp_refresh() a bit earlier in tcp_write_xmit(). Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1151870018e3..ae60dd3faed0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2239,6 +2239,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; + tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); @@ -2250,7 +2251,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } max_segs = tcp_tso_segs(sk, mss_now); - tcp_mstamp_refresh(tp); while ((skb = tcp_send_head(sk))) { unsigned int limit; -- cgit From 8108a77515126f6db4374e8593956e20430307c0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:34 -0700 Subject: bpf: bpf_compute_data uses incorrect cb structure SK_SKB program types use bpf_compute_data to store the end of the packet data. However, bpf_compute_data assumes the cb is stored in the qdisc layer format. But, for SK_SKB this is the wrong layer of the stack for this type. It happens to work (sort of!) because in most cases nothing happens to be overwritten today. This is very fragile and error prone. Fortunately, we have another hole in tcp_skb_cb we can use so lets put the data_end value there. Note, SK_SKB program types do not use data_meta, they are failed by sk_skb_is_valid_access(). Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index aa0265997f93..68eaa2f81a8e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4243,6 +4243,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct __sk_buff, data_end): + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, bpf.data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + default: + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } + + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4301,7 +4326,7 @@ const struct bpf_verifier_ops sock_ops_prog_ops = { const struct bpf_verifier_ops sk_skb_prog_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, - .convert_ctx_access = bpf_convert_ctx_access, + .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; -- cgit From bfa640757e9378c2f26867e723f1287e94f5a7ad Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:53 -0700 Subject: bpf: rename sk_actions to align with bpf infrastructure Recent additions to support multiple programs in cgroups impose a strict requirement, "all yes is yes, any no is no". To enforce this the infrastructure requires the 'no' return code, SK_DROP in this case, to be 0. To apply these rules to SK_SKB program types the sk_actions return codes need to be adjusted. This fix adds SK_PASS and makes 'SK_DROP = 0'. Finally, remove SK_ABORTED to remove any chance that the API may allow aborted program flows to be passed up the stack. This would be incorrect behavior and allow programs to break existing policies. Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 68eaa2f81a8e..6ae94f825f72 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1844,14 +1844,15 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + /* If user passes invalid input drop the packet. */ if (unlikely(flags)) - return SK_ABORTED; + return SK_DROP; tcb->bpf.key = key; tcb->bpf.flags = flags; tcb->bpf.map = map; - return SK_REDIRECT; + return SK_PASS; } struct sock *do_sk_redirect_map(struct sk_buff *skb) -- cgit From d04adf1b355181e737b6b1e23d801b07f0b7c4c0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 02:13:29 +0800 Subject: sctp: reset owner sk for data chunks on out queues when migrating a sock Now when migrating sock to another one in sctp_sock_migrate(), it only resets owner sk for the data in receive queues, not the chunks on out queues. It would cause that data chunks length on the sock is not consistent with sk sk_wmem_alloc. When closing the sock or freeing these chunks, the old sk would never be freed, and the new sock may crash due to the overflow sk_wmem_alloc. syzbot found this issue with this series: r0 = socket$inet_sctp() sendto$inet(r0) listen(r0) accept4(r0) close(r0) Although listen() should have returned error when one TCP-style socket is in connecting (I may fix this one in another patch), it could also be reproduced by peeling off an assoc. This issue is there since very beginning. This patch is to reset owner sk for the chunks on out queues so that sk sk_wmem_alloc has correct value after accept one sock or peeloff an assoc to one sock. Note that when resetting owner sk for chunks on outqueue, it has to sctp_clear_owner_w/skb_orphan chunks before changing assoc->base.sk first and then sctp_set_owner_w them after changing assoc->base.sk, due to that sctp_wfree and it's callees are using assoc->base.sk. Reported-by: Dmitry Vyukov Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 17841ab30798..6f45d1713452 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -170,6 +170,36 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) sk_mem_charge(sk, chunk->skb->truesize); } +static void sctp_clear_owner_w(struct sctp_chunk *chunk) +{ + skb_orphan(chunk->skb); +} + +static void sctp_for_each_tx_datachunk(struct sctp_association *asoc, + void (*cb)(struct sctp_chunk *)) + +{ + struct sctp_outq *q = &asoc->outqueue; + struct sctp_transport *t; + struct sctp_chunk *chunk; + + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) + list_for_each_entry(chunk, &t->transmitted, transmitted_list) + cb(chunk); + + list_for_each_entry(chunk, &q->retransmit, list) + cb(chunk); + + list_for_each_entry(chunk, &q->sacked, list) + cb(chunk); + + list_for_each_entry(chunk, &q->abandoned, list) + cb(chunk); + + list_for_each_entry(chunk, &q->out_chunk_list, list) + cb(chunk); +} + /* Verify that this is a valid address. */ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, int len) @@ -8212,7 +8242,9 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * paths won't try to lock it and then oldsk. */ lock_sock_nested(newsk, SINGLE_DEPTH_NESTING); + sctp_for_each_tx_datachunk(assoc, sctp_clear_owner_w); sctp_assoc_migrate(assoc, newsk); + sctp_for_each_tx_datachunk(assoc, sctp_set_owner_w); /* If the association on the newsk is already closed before accept() * is called, set RCV_SHUTDOWN flag. -- cgit From 50317fce2cc70a2bbbc4b42c31bbad510382a53c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 27 Oct 2017 22:08:56 -0700 Subject: net_sched: avoid matching qdisc with zero handle Davide found the following script triggers a NULL pointer dereference: ip l a name eth0 type dummy tc q a dev eth0 parent :1 handle 1: htb This is because for a freshly created netdevice noop_qdisc is attached and when passing 'parent :1', kernel actually tries to match the major handle which is 0 and noop_qdisc has handle 0 so is matched by mistake. Commit 69012ae425d7 tries to fix a similar bug but still misses this case. Handle 0 is not a valid one, should be just skipped. In fact, kernel uses it as TC_H_UNSPEC. Fixes: 69012ae425d7 ("net: sched: fix handling of singleton qdiscs with qdisc_hash") Fixes: 59cc1f61f09c ("net: sched:convert qdisc linked list to hashtable") Reported-by: Davide Caratti Cc: Jiri Kosina Cc: Eric Dumazet Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c6deb74e3d2f..22bc6fc48311 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -301,6 +301,8 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { struct Qdisc *q; + if (!handle) + return NULL; q = qdisc_match_from_root(dev->qdisc, handle); if (q) goto out; -- cgit From 1da4fc97cbf89514e417a3df46eaec864a9b8a48 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:54 +0800 Subject: sctp: fix some type cast warnings introduced by stream reconf These warnings were found by running 'make C=2 M=net/sctp/'. They are introduced by not aware of Endian when coding stream reconf patches. Since commit c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable") enabled stream reconf feature for users, the Fixes tag below would use it. Fixes: c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable") Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 5 +++-- net/sctp/stream.c | 26 +++++++++++++++++--------- net/sctp/ulpevent.c | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ca8f196b6c6c..57c55045f5a7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3591,7 +3591,7 @@ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, - __u16 stream_num, __u16 *stream_list, + __u16 stream_num, __be16 *stream_list, bool out, bool in) { struct sctp_strreset_outreq outreq; @@ -3788,7 +3788,8 @@ bool sctp_verify_reconf(const struct sctp_association *asoc, { struct sctp_reconf_chunk *hdr; union sctp_params param; - __u16 last = 0, cnt = 0; + __be16 last = 0; + __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr, params) { diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 63ea15503714..fa8371ff05c4 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -118,6 +118,7 @@ int sctp_send_reset_streams(struct sctp_association *asoc, __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; + __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || @@ -148,13 +149,18 @@ int sctp_send_reset_streams(struct sctp_association *asoc, if (str_list[i] >= stream->incnt) goto out; + nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); + if (!nstr_list) { + retval = -ENOMEM; + goto out; + } + for (i = 0; i < str_nums; i++) - str_list[i] = htons(str_list[i]); + nstr_list[i] = htons(str_list[i]); - chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in); + chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); - for (i = 0; i < str_nums; i++) - str_list[i] = ntohs(str_list[i]); + kfree(nstr_list); if (!chunk) { retval = -ENOMEM; @@ -305,7 +311,7 @@ out: } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( - struct sctp_association *asoc, __u32 resp_seq, + struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; @@ -345,8 +351,9 @@ struct sctp_chunk *sctp_process_strreset_outreq( { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; - __u16 i, nums, flags = 0, *str_p = NULL; __u32 result = SCTP_STRRESET_DENIED; + __u16 i, nums, flags = 0; + __be16 *str_p = NULL; __u32 request_seq; request_seq = ntohl(outreq->request_seq); @@ -439,8 +446,9 @@ struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; - __u16 i, nums, *str_p; __u32 request_seq; + __u16 i, nums; + __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || @@ -769,7 +777,7 @@ struct sctp_chunk *sctp_process_strreset_resp( if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; - __u16 *str_p; + __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; @@ -794,7 +802,7 @@ struct sctp_chunk *sctp_process_strreset_resp( nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; - __u16 *str_p; + __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 67abc0194f30..5447228bf1a0 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -847,7 +847,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, __u16 stream_num, - __u16 *stream_list, gfp_t gfp) + __be16 *stream_list, gfp_t gfp) { struct sctp_stream_reset_event *sreset; struct sctp_ulpevent *event; -- cgit From 8d32503efde82db4e0a370981e90628ebd6718b5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:55 +0800 Subject: sctp: fix some type cast warnings introduced by transport rhashtable These warnings were found by running 'make C=2 M=net/sctp/'. They are introduced by not aware of Endian for the port when coding transport rhashtable patches. Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport rhashtable") Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/input.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 34f10e75f3b9..621b5ca3fd1c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -794,7 +794,7 @@ hit: struct sctp_hash_cmp_arg { const union sctp_addr *paddr; const struct net *net; - u16 lport; + __be16 lport; }; static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, @@ -820,37 +820,37 @@ out: return err; } -static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) +static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; const union sctp_addr *paddr = &t->ipaddr; const struct net *net = sock_net(t->asoc->base.sk); - u16 lport = htons(t->asoc->base.bind_addr.port); - u32 addr; + __be16 lport = htons(t->asoc->base.bind_addr.port); + __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else - addr = paddr->v4.sin_addr.s_addr; + addr = (__force __u32)paddr->v4.sin_addr.s_addr; - return jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 | + return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } -static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed) +static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; const union sctp_addr *paddr = x->paddr; const struct net *net = x->net; - u16 lport = x->lport; - u32 addr; + __be16 lport = x->lport; + __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else - addr = paddr->v4.sin_addr.s_addr; + addr = (__force __u32)paddr->v4.sin_addr.s_addr; - return jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 | + return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } -- cgit From f6fc6bc0b8e0bb13a210bd7386ffdcb1a5f30ef1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:56 +0800 Subject: sctp: fix a type cast warnings that causes a_rwnd gets the wrong value These warnings were found by running 'make C=2 M=net/sctp/'. Commit d4d6fb5787a6 ("sctp: Try not to change a_rwnd when faking a SACK from SHUTDOWN.") expected to use the peers old rwnd and add our flight size to the a_rwnd. But with the wrong Endian, it may not work as well as expected. So fix it by converting to the right value. Fixes: d4d6fb5787a6 ("sctp: Try not to change a_rwnd when faking a SACK from SHUTDOWN.") Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e6a2974e020e..8f2762bba879 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1680,8 +1680,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, case SCTP_CMD_PROCESS_CTSN: /* Dummy up a SACK for processing. */ sackh.cum_tsn_ack = cmd->obj.be32; - sackh.a_rwnd = asoc->peer.rwnd + - asoc->outqueue.outstanding_bytes; + sackh.a_rwnd = htonl(asoc->peer.rwnd + + asoc->outqueue.outstanding_bytes); sackh.num_gap_ack_blocks = 0; sackh.num_dup_tsns = 0; chunk->subh.sack_hdr = &sackh; -- cgit From 978aa0474115f3f5848949f2efce4def0766a5cb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:57 +0800 Subject: sctp: fix some type cast warnings introduced since very beginning These warnings were found by running 'make C=2 M=net/sctp/'. They are there since very beginning. Note after this patch, there still one warning left in sctp_outq_flush(): sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM) Since it has been moved to sctp_stream_outq_migrate on net-next, to avoid the extra job when merging net-next to net, I will post the fix for it after the merging is done. Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 2 +- net/sctp/sm_make_chunk.c | 4 ++-- net/sctp/sm_sideeffect.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7fe9e1d1b7ec..a6dfa86c0201 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -738,7 +738,7 @@ static int sctp_v6_skb_iif(const struct sk_buff *skb) /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { - return *((__u32 *)(ipv6_hdr(skb))) & htonl(1 << 20); + return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 57c55045f5a7..514465b03829 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2854,7 +2854,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); - param.crr_id = i; + param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); @@ -2867,7 +2867,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); - param.crr_id = i; + param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 8f2762bba879..e2d9a4b49c9c 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1607,12 +1607,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_INIT_FAILED: - sctp_cmd_init_failed(commands, asoc, cmd->obj.err); + sctp_cmd_init_failed(commands, asoc, cmd->obj.u32); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, - subtype, chunk, cmd->obj.err); + subtype, chunk, cmd->obj.u32); break; case SCTP_CMD_INIT_COUNTER_INC: -- cgit From 7aa0045dadb6ef37485ea9f2a7d28278ca588b51 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:28 -0700 Subject: net_sched: introduce a workqueue for RCU callbacks of tc filter This patch introduces a dedicated workqueue for tc filters so that each tc filter's RCU callback could defer their action destroy work to this workqueue. The helper tcf_queue_work() is introduced for them to use. Because we hold RTNL lock when calling tcf_block_put(), we can not simply flush works inside it, therefore we have to defer it again to this workqueue and make sure all flying RCU callbacks have already queued their work before this one, in other words, to ensure this is the last one to execute to prevent any use-after-free. On the other hand, this makes tcf_block_put() ugly and harder to understand. Since David and Eric strongly dislike adding synchronize_rcu(), this is probably the only solution that could make everyone happy. Please also see the code comments below. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 68 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..045d13679ad6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -77,6 +77,8 @@ out: } EXPORT_SYMBOL(register_tcf_proto_ops); +static struct workqueue_struct *tc_filter_wq; + int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; @@ -86,6 +88,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) * tcf_proto_ops's destroy() handler. */ rcu_barrier(); + flush_workqueue(tc_filter_wq); write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { @@ -100,6 +103,12 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) } EXPORT_SYMBOL(unregister_tcf_proto_ops); +bool tcf_queue_work(struct work_struct *work) +{ + return queue_work(tc_filter_wq, work); +} +EXPORT_SYMBOL(tcf_queue_work); + /* Select new prio value from the range, managed by kernel. */ static inline u32 tcf_auto_prio(struct tcf_proto *tp) @@ -266,23 +275,30 @@ err_chain_create: } EXPORT_SYMBOL(tcf_block_get); -void tcf_block_put(struct tcf_block *block) +static void tcf_block_put_final(struct work_struct *work) { + struct tcf_block *block = container_of(work, struct tcf_block, work); struct tcf_chain *chain, *tmp; - if (!block) - return; - - /* XXX: Standalone actions are not allowed to jump to any chain, and - * bound actions should be all removed after flushing. However, - * filters are destroyed in RCU callbacks, we have to hold the chains - * first, otherwise we would always race with RCU callbacks on this list - * without proper locking. - */ + /* At this point, all the chains should have refcnt == 1. */ + rtnl_lock(); + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); + rtnl_unlock(); + kfree(block); +} - /* Wait for existing RCU callbacks to cool down. */ - rcu_barrier(); +/* XXX: Standalone actions are not allowed to jump to any chain, and bound + * actions should be all removed after flushing. However, filters are destroyed + * in RCU callbacks, we have to hold the chains first, otherwise we would + * always race with RCU callbacks on this list without proper locking. + */ +static void tcf_block_put_deferred(struct work_struct *work) +{ + struct tcf_block *block = container_of(work, struct tcf_block, work); + struct tcf_chain *chain; + rtnl_lock(); /* Hold a refcnt for all chains, except 0, in case they are gone. */ list_for_each_entry(chain, &block->chain_list, list) if (chain->index) @@ -292,13 +308,27 @@ void tcf_block_put(struct tcf_block *block) list_for_each_entry(chain, &block->chain_list, list) tcf_chain_flush(chain); - /* Wait for RCU callbacks to release the reference count. */ + INIT_WORK(&block->work, tcf_block_put_final); + /* Wait for RCU callbacks to release the reference count and make + * sure their works have been queued before this. + */ rcu_barrier(); + tcf_queue_work(&block->work); + rtnl_unlock(); +} - /* At this point, all the chains should have refcnt == 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) - tcf_chain_put(chain); - kfree(block); +void tcf_block_put(struct tcf_block *block) +{ + if (!block) + return; + + INIT_WORK(&block->work, tcf_block_put_deferred); + /* Wait for existing RCU callbacks to cool down, make sure their works + * have been queued before this. We can not flush pending works here + * because we are holding the RTNL lock. + */ + rcu_barrier(); + tcf_queue_work(&block->work); } EXPORT_SYMBOL(tcf_block_put); @@ -1030,6 +1060,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev); static int __init tc_filter_init(void) { + tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0); + if (!tc_filter_wq) + return -ENOMEM; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, -- cgit From c96a48385d53089ee9977dd0bce82a9493984484 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:29 -0700 Subject: net_sched: use tcf_queue_work() in basic filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d89ebafd2239..f177649a2419 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -34,7 +34,10 @@ struct basic_filter { struct tcf_result res; struct tcf_proto *tp; struct list_head link; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -82,15 +85,26 @@ static int basic_init(struct tcf_proto *tp) return 0; } -static void basic_delete_filter(struct rcu_head *head) +static void basic_delete_filter_work(struct work_struct *work) { - struct basic_filter *f = container_of(head, struct basic_filter, rcu); + struct basic_filter *f = container_of(work, struct basic_filter, work); + rtnl_lock(); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); + rtnl_unlock(); + kfree(f); } +static void basic_delete_filter(struct rcu_head *head) +{ + struct basic_filter *f = container_of(head, struct basic_filter, rcu); + + INIT_WORK(&f->work, basic_delete_filter_work); + tcf_queue_work(&f->work); +} + static void basic_destroy(struct tcf_proto *tp) { struct basic_head *head = rtnl_dereference(tp->root); -- cgit From e910af676b565ecc16bcd6c896ecb68157396ecc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:30 -0700 Subject: net_sched: use tcf_queue_work() in bpf filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 520c5027646a..037a3ae86829 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -49,7 +49,10 @@ struct cls_bpf_prog { struct sock_filter *bpf_ops; const char *bpf_name; struct tcf_proto *tp; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { @@ -257,9 +260,21 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) kfree(prog); } +static void cls_bpf_delete_prog_work(struct work_struct *work) +{ + struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work); + + rtnl_lock(); + __cls_bpf_delete_prog(prog); + rtnl_unlock(); +} + static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) { - __cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu)); + struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); + + INIT_WORK(&prog->work, cls_bpf_delete_prog_work); + tcf_queue_work(&prog->work); } static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) -- cgit From b1b5b04fdb6da262aef37ef83b9f2e41326720ef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:31 -0700 Subject: net_sched: use tcf_queue_work() in cgroup filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_cgroup.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index d48452f87975..a97e069bee89 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,7 +23,10 @@ struct cls_cgroup_head { struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_proto *tp; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -57,15 +60,26 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; +static void cls_cgroup_destroy_work(struct work_struct *work) +{ + struct cls_cgroup_head *head = container_of(work, + struct cls_cgroup_head, + work); + rtnl_lock(); + tcf_exts_destroy(&head->exts); + tcf_em_tree_destroy(&head->ematches); + kfree(head); + rtnl_unlock(); +} + static void cls_cgroup_destroy_rcu(struct rcu_head *root) { struct cls_cgroup_head *head = container_of(root, struct cls_cgroup_head, rcu); - tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(&head->ematches); - kfree(head); + INIT_WORK(&head->work, cls_cgroup_destroy_work); + tcf_queue_work(&head->work); } static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, -- cgit From 94cdb47566b799649e996e1fb9de2a503dada763 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:32 -0700 Subject: net_sched: use tcf_queue_work() in flow filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_flow.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 2a3a60ec5b86..67f3a2af6aab 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -57,7 +57,10 @@ struct flow_filter { u32 divisor; u32 baseclass; u32 hashrnd; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static inline u32 addr_fold(void *addr) @@ -369,14 +372,24 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; -static void flow_destroy_filter(struct rcu_head *head) +static void flow_destroy_filter_work(struct work_struct *work) { - struct flow_filter *f = container_of(head, struct flow_filter, rcu); + struct flow_filter *f = container_of(work, struct flow_filter, work); + rtnl_lock(); del_timer_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); kfree(f); + rtnl_unlock(); +} + +static void flow_destroy_filter(struct rcu_head *head) +{ + struct flow_filter *f = container_of(head, struct flow_filter, rcu); + + INIT_WORK(&f->work, flow_destroy_filter_work); + tcf_queue_work(&f->work); } static int flow_change(struct net *net, struct sk_buff *in_skb, -- cgit From 0552c8afa077889b4704ef5ee88b03063ad45023 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:33 -0700 Subject: net_sched: use tcf_queue_work() in flower filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b480d7c792ba..5b5722c8b32c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -87,7 +87,10 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; struct net_device *hw_dev; }; @@ -215,12 +218,22 @@ static int fl_init(struct tcf_proto *tp) return 0; } -static void fl_destroy_filter(struct rcu_head *head) +static void fl_destroy_filter_work(struct work_struct *work) { - struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work); + rtnl_lock(); tcf_exts_destroy(&f->exts); kfree(f); + rtnl_unlock(); +} + +static void fl_destroy_filter(struct rcu_head *head) +{ + struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + + INIT_WORK(&f->work, fl_destroy_filter_work); + tcf_queue_work(&f->work); } static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) -- cgit From e071dff2a6beeccb6f9744f9a0251ab773ca2ab8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:34 -0700 Subject: net_sched: use tcf_queue_work() in fw filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_fw.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 941245ad07fd..99183b8621ec 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -46,7 +46,10 @@ struct fw_filter { #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static u32 fw_hash(u32 handle) @@ -119,12 +122,22 @@ static int fw_init(struct tcf_proto *tp) return 0; } -static void fw_delete_filter(struct rcu_head *head) +static void fw_delete_filter_work(struct work_struct *work) { - struct fw_filter *f = container_of(head, struct fw_filter, rcu); + struct fw_filter *f = container_of(work, struct fw_filter, work); + rtnl_lock(); tcf_exts_destroy(&f->exts); kfree(f); + rtnl_unlock(); +} + +static void fw_delete_filter(struct rcu_head *head) +{ + struct fw_filter *f = container_of(head, struct fw_filter, rcu); + + INIT_WORK(&f->work, fw_delete_filter_work); + tcf_queue_work(&f->work); } static void fw_destroy(struct tcf_proto *tp) -- cgit From df2735ee8e6ca202a8630f237b59401a25193be1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:35 -0700 Subject: net_sched: use tcf_queue_work() in matchall filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index eeac606c95ab..c33f711b9019 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,7 +21,10 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -41,13 +44,23 @@ static int mall_init(struct tcf_proto *tp) return 0; } +static void mall_destroy_work(struct work_struct *work) +{ + struct cls_mall_head *head = container_of(work, struct cls_mall_head, + work); + rtnl_lock(); + tcf_exts_destroy(&head->exts); + kfree(head); + rtnl_unlock(); +} + static void mall_destroy_rcu(struct rcu_head *rcu) { struct cls_mall_head *head = container_of(rcu, struct cls_mall_head, rcu); - tcf_exts_destroy(&head->exts); - kfree(head); + INIT_WORK(&head->work, mall_destroy_work); + tcf_queue_work(&head->work); } static int mall_replace_hw_filter(struct tcf_proto *tp, -- cgit From c0d378ef1266546a39f2df00a56ff1f74166a2b7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:36 -0700 Subject: net_sched: use tcf_queue_work() in u32 filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 10b8d851fc6b..dadd1b344497 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -68,7 +68,10 @@ struct tc_u_knode { u32 __percpu *pcpu_success; #endif struct tcf_proto *tp; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. */ @@ -418,11 +421,21 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, * this the u32_delete_key_rcu variant does not free the percpu * statistics. */ +static void u32_delete_key_work(struct work_struct *work) +{ + struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); + + rtnl_lock(); + u32_destroy_key(key->tp, key, false); + rtnl_unlock(); +} + static void u32_delete_key_rcu(struct rcu_head *rcu) { struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - u32_destroy_key(key->tp, key, false); + INIT_WORK(&key->work, u32_delete_key_work); + tcf_queue_work(&key->work); } /* u32_delete_key_freepf_rcu is the rcu callback variant @@ -432,11 +445,21 @@ static void u32_delete_key_rcu(struct rcu_head *rcu) * for the variant that should be used with keys return from * u32_init_knode() */ +static void u32_delete_key_freepf_work(struct work_struct *work) +{ + struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); + + rtnl_lock(); + u32_destroy_key(key->tp, key, true); + rtnl_unlock(); +} + static void u32_delete_key_freepf_rcu(struct rcu_head *rcu) { struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - u32_destroy_key(key->tp, key, true); + INIT_WORK(&key->work, u32_delete_key_freepf_work); + tcf_queue_work(&key->work); } static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) -- cgit From c2f3f31d402be4849b06282c3a5278f2865c9fcc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:37 -0700 Subject: net_sched: use tcf_queue_work() in route filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_route.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 9ddde65915d2..4b14ccd8b8f2 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -57,7 +57,10 @@ struct route4_filter { u32 handle; struct route4_bucket *bkt; struct tcf_proto *tp; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) @@ -254,12 +257,22 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static void route4_delete_filter(struct rcu_head *head) +static void route4_delete_filter_work(struct work_struct *work) { - struct route4_filter *f = container_of(head, struct route4_filter, rcu); + struct route4_filter *f = container_of(work, struct route4_filter, work); + rtnl_lock(); tcf_exts_destroy(&f->exts); kfree(f); + rtnl_unlock(); +} + +static void route4_delete_filter(struct rcu_head *head) +{ + struct route4_filter *f = container_of(head, struct route4_filter, rcu); + + INIT_WORK(&f->work, route4_delete_filter_work); + tcf_queue_work(&f->work); } static void route4_destroy(struct tcf_proto *tp) -- cgit From d4f84a41dc615c166555cd332b0235bf6b9bcb4a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:38 -0700 Subject: net_sched: use tcf_queue_work() in rsvp filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_rsvp.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index b1f6ed48bc72..bdbc541787f8 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -97,7 +97,10 @@ struct rsvp_filter { u32 handle; struct rsvp_session *sess; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) @@ -282,12 +285,22 @@ static int rsvp_init(struct tcf_proto *tp) return -ENOBUFS; } -static void rsvp_delete_filter_rcu(struct rcu_head *head) +static void rsvp_delete_filter_work(struct work_struct *work) { - struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); + struct rsvp_filter *f = container_of(work, struct rsvp_filter, work); + rtnl_lock(); tcf_exts_destroy(&f->exts); kfree(f); + rtnl_unlock(); +} + +static void rsvp_delete_filter_rcu(struct rcu_head *head) +{ + struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); + + INIT_WORK(&f->work, rsvp_delete_filter_work); + tcf_queue_work(&f->work); } static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) -- cgit From 27ce4f05e2abbe2d3ec7434e456619a5178cd3bd Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:39 -0700 Subject: net_sched: use tcf_queue_work() in tcindex filter Defer the tcf_exts_destroy() in RCU callback to tc filter workqueue and get RTNL lock. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 14a7e08b2fa9..beaa95e09c25 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -27,14 +27,20 @@ struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; struct tcindex_filter { u16 key; struct tcindex_filter_result result; struct tcindex_filter __rcu *next; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; @@ -133,12 +139,34 @@ static int tcindex_init(struct tcf_proto *tp) return 0; } +static void tcindex_destroy_rexts_work(struct work_struct *work) +{ + struct tcindex_filter_result *r; + + r = container_of(work, struct tcindex_filter_result, work); + rtnl_lock(); + tcf_exts_destroy(&r->exts); + rtnl_unlock(); +} + static void tcindex_destroy_rexts(struct rcu_head *head) { struct tcindex_filter_result *r; r = container_of(head, struct tcindex_filter_result, rcu); - tcf_exts_destroy(&r->exts); + INIT_WORK(&r->work, tcindex_destroy_rexts_work); + tcf_queue_work(&r->work); +} + +static void tcindex_destroy_fexts_work(struct work_struct *work) +{ + struct tcindex_filter *f = container_of(work, struct tcindex_filter, + work); + + rtnl_lock(); + tcf_exts_destroy(&f->result.exts); + kfree(f); + rtnl_unlock(); } static void tcindex_destroy_fexts(struct rcu_head *head) @@ -146,8 +174,8 @@ static void tcindex_destroy_fexts(struct rcu_head *head) struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); - tcf_exts_destroy(&f->result.exts); - kfree(f); + INIT_WORK(&f->work, tcindex_destroy_fexts_work); + tcf_queue_work(&f->work); } static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last) -- cgit From 2d132eba1d972ea6c0e47286e4c821b4a3c5b84d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:40 -0700 Subject: net_sched: add rtnl assertion to tcf_exts_destroy() After previous patches, it is now safe to claim that tcf_exts_destroy() is always called with RTNL lock. Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 045d13679ad6..231181c602ed 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -909,6 +909,7 @@ void tcf_exts_destroy(struct tcf_exts *exts) #ifdef CONFIG_NET_CLS_ACT LIST_HEAD(actions); + ASSERT_RTNL(); tcf_exts_to_list(exts, &actions); tcf_action_destroy(&actions, TCA_ACT_UNBIND); kfree(exts->actions); -- cgit From 46e235c15ca44f34cb79f4dbec909b8c51999dc1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:41 -0700 Subject: net_sched: fix call_rcu() race on act_sample module removal Similar to commit c78e1746d3ad ("net: sched: fix call_rcu() race on classifier module unloads"), we need to wait for flying RCU callback tcf_sample_cleanup_rcu(). Cc: Yotam Gigi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_sample.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index ec986ae52808..a9f9a2ccc664 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -264,6 +264,7 @@ static int __init sample_init_module(void) static void __exit sample_cleanup_module(void) { + rcu_barrier(); tcf_unregister_action(&act_sample_ops, &sample_net_ops); } -- cgit