From 014894360ec95abe868e94416b3dd6569f6e2c0c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 13 May 2019 07:19:19 -0700 Subject: bpf: sockmap, only stop/flush strp if it was enabled at some point If we try to call strp_done on a parser that has never been initialized, because the sockmap user is only using TX side for example we get the following error. [ 883.422081] WARNING: CPU: 1 PID: 208 at kernel/workqueue.c:3030 __flush_work+0x1ca/0x1e0 ... [ 883.422095] Workqueue: events sk_psock_destroy_deferred [ 883.422097] RIP: 0010:__flush_work+0x1ca/0x1e0 This had been wrapped in a 'if (psock->parser.enabled)' logic which was broken because the strp_done() was never actually being called because we do a strp_stop() earlier in the tear down logic will set parser.enabled to false. This could result in a use after free if work was still in the queue and was resolved by the patch here, 1d79895aef18f ("sk_msg: Always cancel strp work before freeing the psock"). However, calling strp_stop(), done by the patch marked in the fixes tag, only is useful if we never initialized a strp parser program and never initialized the strp to start with. Because if we had initialized a stream parser strp_stop() would have been called by sk_psock_drop() earlier in the tear down process. By forcing the strp to stop we get past the WARNING in strp_done that checks the stopped flag but calling cancel_work_sync on work that has never been initialized is also wrong and generates the warning above. To fix check if the parser program exists. If the program exists then the strp work has been initialized and must be sync'd and cancelled before free'ing any structures. If no program exists we never initialized the stream parser in the first place so skip the sync/cancel logic implemented by strp_done. Finally, remove the strp_done its not needed and in the case where we are using the stream parser has already been called. Fixes: e8e3437762ad9 ("bpf: Stop the psock parser before canceling its work") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index cc94d921476c..49d1efa329d7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -554,8 +554,10 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. */ - strp_stop(&psock->parser.strp); - strp_done(&psock->parser.strp); + + /* Parser has been stopped */ + if (psock->progs.skb_parser) + strp_done(&psock->parser.strp); cancel_work_sync(&psock->work); -- cgit From c42253cc88206fd0e9868c8b2fd7f9e79f9e0e03 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 13 May 2019 07:19:37 -0700 Subject: bpf: sockmap remove duplicate queue free In tcp bpf remove we free the cork list and purge the ingress msg list. However we do this before the ref count reaches zero so it could be possible some other access is in progress. In this case (tcp close and/or tcp_unhash) we happen to also hold the sock lock so no path exists but lets fix it otherwise it is extremely fragile and breaks the reference counting rules. Also we already check the cork list and ingress msg queue and free them once the ref count reaches zero so its wasteful to check twice. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/ipv4/tcp_bpf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 1bb7321a256d..4a619c85daed 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -528,8 +528,6 @@ static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; - sk_psock_cork_free(psock); - __sk_psock_purge_ingress_msg(psock); while ((link = sk_psock_link_pop(psock))) { sk_psock_unlink(sk, link); sk_psock_free_link(link); -- cgit From cabede8b4f2b746232aa25730a0b752de1cb82ca Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 13 May 2019 07:19:55 -0700 Subject: bpf: sockmap fix msg->sg.size account on ingress skb When converting a skb to msg->sg we forget to set the size after the latest ktls/tls code conversion. This patch can be reached by doing a redir into ingress path from BPF skb sock recv hook. Then trying to read the size fails. Fix this by setting the size. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 49d1efa329d7..93bffaad2135 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -411,6 +411,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) sk_mem_charge(sk, skb->len); copied = skb->len; msg->sg.start = 0; + msg->sg.size = copied; msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge; msg->skb = skb; -- cgit From cd736d8b67fb22a85a68c1ee8020eb0d660615ec Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 13 May 2019 10:32:05 -0700 Subject: tcp: fix retrans timestamp on passive Fast Open Commit c7d13c8faa74 ("tcp: properly track retry time on passive Fast Open") sets the start of SYNACK retransmission time on passive Fast Open in "retrans_stamp". However the timestamp is not reset upon the handshake has completed. As a result, future data packet retransmission may not update it in tcp_retransmit_skb(). This may lead to socket aborting earlier unexpectedly by retransmits_timed_out() since retrans_stamp remains the SYNACK rtx time. This bug only manifests on passive TFO sender that a) suffered SYNACK timeout and then b) stalls on very first loss recovery. Any successful loss recovery would reset the timestamp to avoid this issue. Fixes: c7d13c8faa74 ("tcp: properly track retry time on passive Fast Open") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 20f6fac5882e..c61edd023b35 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6024,6 +6024,9 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { tcp_try_undo_loss(sk, false); + + /* Reset rtx states to prevent spurious retransmits_timed_out() */ + tcp_sk(sk)->retrans_stamp = 0; inet_csk(sk)->icsk_retransmits = 0; /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, -- cgit From feadc4b6cf42a53a8a93c918a569a0b7e62bd350 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 14 May 2019 15:12:19 +0200 Subject: rtnetlink: always put IFLA_LINK for links with a link-netnsid Currently, nla_put_iflink() doesn't put the IFLA_LINK attribute when iflink == ifindex. In some cases, a device can be created in a different netns with the same ifindex as its parent. That device will not dump its IFLA_LINK attribute, which can confuse some userspace software that expects it. For example, if the last ifindex created in init_net and foo are both 8, these commands will trigger the issue: ip link add parent type dummy # ifindex 9 ip link add link parent netns foo type macvlan # ifindex 9 in ns foo So, in case a device puts the IFLA_LINK_NETNSID attribute in a dump, always put the IFLA_LINK attribute as well. Thanks to Dan Winship for analyzing the original OpenShift bug down to the missing netlink attribute. v2: change Fixes tag, it's been here forever, as Nicolas Dichtel said add Nicolas' ack v3: change Fixes tag fix subject typo, spotted by Edward Cree Analyzed-by: Dan Winship Fixes: d8a5ec672768 ("[NET]: netlink support for moving devices between network namespaces.") Signed-off-by: Sabrina Dubroca Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2bd12afb9297..adcc045952c2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1496,14 +1496,15 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) return ret; } -static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, + bool force) { int ifindex = dev_get_iflink(dev); - if (dev->ifindex == ifindex) - return 0; + if (force || dev->ifindex != ifindex) + return nla_put_u32(skb, IFLA_LINK, ifindex); - return nla_put_u32(skb, IFLA_LINK, ifindex); + return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, @@ -1520,6 +1521,8 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net) { + bool put_iflink = false; + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); @@ -1528,10 +1531,12 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; + + put_iflink = true; } } - return 0; + return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, @@ -1617,7 +1622,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - nla_put_iflink(skb, dev) || put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && -- cgit From 858f5017446764e8bca0b29589a3b164186ae471 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2019 09:10:15 -0700 Subject: tcp: do not recycle cloned skbs It is illegal to change arbitrary fields in skb_shared_info if the skb is cloned. Before calling skb_zcopy_clear() we need to ensure this rule, therefore we need to move the test from sk_stream_alloc_skb() to sk_wmem_free_skb() Fixes: 4f661542a402 ("tcp: fix zerocopy and notsent_lowat issues") Signed-off-by: Eric Dumazet Diagnosed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1fa15beb8380..53d61ca3ac4b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -855,7 +855,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(!size)) { skb = sk->sk_tx_skb_cache; - if (skb && !skb_cloned(skb)) { + if (skb) { skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); sk->sk_tx_skb_cache = NULL; pskb_trim(skb, 0); -- cgit From 5fa2ca7c4a3fc176f31b495e1a704862d8188b53 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 13 May 2019 21:42:03 -0700 Subject: bpf, tcp: correctly handle DONT_WAIT flags and timeo == 0 The tcp_bpf_wait_data() routine needs to check timeo != 0 before calling sk_wait_event() otherwise we may see unexpected stalls on receiver. Arika did all the leg work here I just formatted, posted and ran a few tests. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: Arika Chen Suggested-by: Arika Chen Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 4a619c85daed..3d1e15401384 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -27,7 +27,10 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, int flags, long timeo, int *err) { DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret; + int ret = 0; + + if (!timeo) + return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); -- cgit From bae9ed69029c7d499c57485593b2faae475fd704 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 14 May 2019 21:18:12 +0100 Subject: flow_offload: support CVLAN match Plumb it through from the flow_dissector. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/flow_offload.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index c3a00eac4804..5ce7d47a960e 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -54,6 +54,13 @@ void flow_rule_match_vlan(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_vlan); +void flow_rule_match_cvlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_cvlan); + void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out) { -- cgit From a9eeb998c28d5506616426bd3a216bd5735a18b8 Mon Sep 17 00:00:00 2001 From: Sunil Muthuswamy Date: Wed, 15 May 2019 00:56:05 +0000 Subject: hv_sock: Add support for delayed close Currently, hvsock does not implement any delayed or background close logic. Whenever the hvsock socket is closed, a FIN is sent to the peer, and the last reference to the socket is dropped, which leads to a call to .destruct where the socket can hang indefinitely waiting for the peer to close it's side. The can cause the user application to hang in the close() call. This change implements proper STREAM(TCP) closing handshake mechanism by sending the FIN to the peer and the waiting for the peer's FIN to arrive for a given timeout. On timeout, it will try to terminate the connection (i.e. a RST). This is in-line with other socket providers such as virtio. This change does not address the hang in the vmbus_hvsock_device_unregister where it waits indefinitely for the host to rescind the channel. That should be taken up as a separate fix. Signed-off-by: Sunil Muthuswamy Reviewed-by: Dexuan Cui Signed-off-by: David S. Miller --- net/vmw_vsock/hyperv_transport.c | 108 ++++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index a827547aa102..982a8dc49e03 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -35,6 +35,9 @@ /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) +/* How long to wait for graceful shutdown of a connection */ +#define HVS_CLOSE_TIMEOUT (8 * HZ) + struct vmpipe_proto_header { u32 pkt_type; u32 data_size; @@ -305,19 +308,32 @@ static void hvs_channel_cb(void *ctx) sk->sk_write_space(sk); } -static void hvs_close_connection(struct vmbus_channel *chan) +static void hvs_do_close_lock_held(struct vsock_sock *vsk, + bool cancel_timeout) { - struct sock *sk = get_per_channel_state(chan); - struct vsock_sock *vsk = vsock_sk(sk); - - lock_sock(sk); + struct sock *sk = sk_vsock(vsk); - sk->sk_state = TCP_CLOSE; sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; - + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); + if (vsk->close_work_scheduled && + (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { + vsk->close_work_scheduled = false; + vsock_remove_sock(vsk); + /* Release the reference taken while scheduling the timeout */ + sock_put(sk); + } +} + +static void hvs_close_connection(struct vmbus_channel *chan) +{ + struct sock *sk = get_per_channel_state(chan); + + lock_sock(sk); + hvs_do_close_lock_held(vsock_sk(sk), true); release_sock(sk); } @@ -452,50 +468,80 @@ static int hvs_connect(struct vsock_sock *vsk) return vmbus_send_tl_connect_request(&h->vm_srv_id, &h->host_srv_id); } +static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode) +{ + struct vmpipe_proto_header hdr; + + if (hvs->fin_sent || !hvs->chan) + return; + + /* It can't fail: see hvs_channel_writable_bytes(). */ + (void)hvs_send_data(hvs->chan, (struct hvs_send_buf *)&hdr, 0); + hvs->fin_sent = true; +} + static int hvs_shutdown(struct vsock_sock *vsk, int mode) { struct sock *sk = sk_vsock(vsk); - struct vmpipe_proto_header hdr; - struct hvs_send_buf *send_buf; - struct hvsock *hvs; if (!(mode & SEND_SHUTDOWN)) return 0; lock_sock(sk); + hvs_shutdown_lock_held(vsk->trans, mode); + release_sock(sk); + return 0; +} - hvs = vsk->trans; - if (hvs->fin_sent) - goto out; - - send_buf = (struct hvs_send_buf *)&hdr; +static void hvs_close_timeout(struct work_struct *work) +{ + struct vsock_sock *vsk = + container_of(work, struct vsock_sock, close_work.work); + struct sock *sk = sk_vsock(vsk); - /* It can't fail: see hvs_channel_writable_bytes(). */ - (void)hvs_send_data(hvs->chan, send_buf, 0); + sock_hold(sk); + lock_sock(sk); + if (!sock_flag(sk, SOCK_DONE)) + hvs_do_close_lock_held(vsk, false); - hvs->fin_sent = true; -out: + vsk->close_work_scheduled = false; release_sock(sk); - return 0; + sock_put(sk); } -static void hvs_release(struct vsock_sock *vsk) +/* Returns true, if it is safe to remove socket; false otherwise */ +static bool hvs_close_lock_held(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); - struct hvsock *hvs = vsk->trans; - struct vmbus_channel *chan; - lock_sock(sk); + if (!(sk->sk_state == TCP_ESTABLISHED || + sk->sk_state == TCP_CLOSING)) + return true; - sk->sk_state = TCP_CLOSING; - vsock_remove_sock(vsk); + if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) + hvs_shutdown_lock_held(vsk->trans, SHUTDOWN_MASK); - release_sock(sk); + if (sock_flag(sk, SOCK_DONE)) + return true; - chan = hvs->chan; - if (chan) - hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN); + /* This reference will be dropped by the delayed close routine */ + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->close_work, hvs_close_timeout); + vsk->close_work_scheduled = true; + schedule_delayed_work(&vsk->close_work, HVS_CLOSE_TIMEOUT); + return false; +} +static void hvs_release(struct vsock_sock *vsk) +{ + struct sock *sk = sk_vsock(vsk); + bool remove_sock; + + lock_sock(sk); + remove_sock = hvs_close_lock_held(vsk); + release_sock(sk); + if (remove_sock) + vsock_remove_sock(vsk); } static void hvs_destruct(struct vsock_sock *vsk) -- cgit From 752beb5ec4413d40434957e427c6c48d5043f805 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 15 May 2019 14:40:52 +0300 Subject: net: bpfilter: fallback to netfilter if failed to load bpfilter kernel module If bpfilter is not available return ENOPROTOOPT to fallback to netfilter. Function request_module() returns both errors and userspace exit codes. Just ignore them. Rechecking bpfilter_ops is enough. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/ipv4/bpfilter/sockopt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 15427163a041..0480918bfc7c 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -30,13 +30,11 @@ static int bpfilter_mbox_request(struct sock *sk, int optname, mutex_lock(&bpfilter_ops.lock); if (!bpfilter_ops.sockopt) { mutex_unlock(&bpfilter_ops.lock); - err = request_module("bpfilter"); + request_module("bpfilter"); mutex_lock(&bpfilter_ops.lock); - if (err) - goto out; if (!bpfilter_ops.sockopt) { - err = -ECHILD; + err = -ENOPROTOOPT; goto out; } } -- cgit From 61fb0d01680771f72cc9d39783fb2c122aaad51e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2019 19:39:52 -0700 Subject: ipv6: prevent possible fib6 leaks At ipv6 route dismantle, fib6_drop_pcpu_from() is responsible for finding all percpu routes and set their ->from pointer to NULL, so that fib6_ref can reach its expected value (1). The problem right now is that other cpus can still catch the route being deleted, since there is no rcu grace period between the route deletion and call to fib6_drop_pcpu_from() This can leak the fib6 and associated resources, since no notifier will take care of removing the last reference(s). I decided to add another boolean (fib6_destroying) instead of reusing/renaming exception_bucket_flushed to ease stable backports, and properly document the memory barriers used to implement this fix. This patch has been co-developped with Wei Wang. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Wei Wang Cc: David Ahern Cc: Martin Lau Acked-by: Wei Wang Acked-by: Martin KaFai Lau Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 12 +++++++++--- net/ipv6/route.c | 7 +++++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 08e0390e001c..008421b550c6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -904,6 +904,12 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, { int cpu; + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -927,6 +933,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; + if (rt->rt6i_pcpu) + fib6_drop_pcpu_from(rt, table); + if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -948,9 +957,6 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } - - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 23a20d62daac..27c0cc5d9d30 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1295,6 +1295,13 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); + if (res->f6i->fib6_destroying) { + struct fib6_info *from; + + from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + fib6_info_release(from); + } + return pcpu_rt; } -- cgit From 7e27e8d6130c5e88fac9ddec4249f7f2337fe7f8 Mon Sep 17 00:00:00 2001 From: Junwei Hu Date: Thu, 16 May 2019 10:51:15 +0800 Subject: tipc: switch order of device registration to fix a crash When tipc is loaded while many processes try to create a TIPC socket, a crash occurs: PANIC: Unable to handle kernel paging request at virtual address "dfff20000000021d" pc : tipc_sk_create+0x374/0x1180 [tipc] lr : tipc_sk_create+0x374/0x1180 [tipc] Exception class = DABT (current EL), IL = 32 bits Call trace: tipc_sk_create+0x374/0x1180 [tipc] __sock_create+0x1cc/0x408 __sys_socket+0xec/0x1f0 __arm64_sys_socket+0x74/0xa8 ... This is due to race between sock_create and unfinished register_pernet_device. tipc_sk_insert tries to do "net_generic(net, tipc_net_id)". but tipc_net_id is not initialized yet. So switch the order of the two to close the race. This can be reproduced with multiple processes doing socket(AF_TIPC, ...) and one process doing module removal. Fixes: a62fbccecd62 ("tipc: make subscriber server support net namespace") Signed-off-by: Junwei Hu Reported-by: Wang Wang Reviewed-by: Xiaogang Wang Signed-off-by: David S. Miller --- net/tipc/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index 27cccd101ef6..ddd2e0f67c07 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -131,10 +131,6 @@ static int __init tipc_init(void) if (err) goto out_netlink_compat; - err = tipc_socket_init(); - if (err) - goto out_socket; - err = tipc_register_sysctl(); if (err) goto out_sysctl; @@ -143,6 +139,10 @@ static int __init tipc_init(void) if (err) goto out_pernet; + err = tipc_socket_init(); + if (err) + goto out_socket; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -150,12 +150,12 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: + tipc_socket_stop(); +out_socket: unregister_pernet_subsys(&tipc_net_ops); out_pernet: tipc_unregister_sysctl(); out_sysctl: - tipc_socket_stop(); -out_socket: tipc_netlink_compat_stop(); out_netlink_compat: tipc_netlink_stop(); @@ -167,10 +167,10 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + tipc_socket_stop(); unregister_pernet_subsys(&tipc_net_ops); tipc_netlink_stop(); tipc_netlink_compat_stop(); - tipc_socket_stop(); tipc_unregister_sysctl(); pr_info("Deactivated\n"); -- cgit From 858e5400e682370d0f8d217fcd6fd6ae5b256f5f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 16 May 2019 11:28:16 +0200 Subject: xfrm: ressurrect "Fix uninitialized memory read in _decode_session4" This resurrects commit 8742dc86d0c7a9628 ("xfrm4: Fix uninitialized memory read in _decode_session4"), which got lost during a merge conflict resolution between ipsec-next and net-next tree. c53ac41e3720 ("xfrm: remove decode_session indirection from afinfo_policy") in ipsec-next moved the (buggy) _decode_session4 from net/ipv4/xfrm4_policy.c to net/xfrm/xfrm_policy.c. In mean time, 8742dc86d0c7a was applied to ipsec.git and fixed the problem in the "old" location. When the trees got merged, the moved, old function was kept. This applies the "lost" commit again, to the new location. Fixes: a658a3f2ecbab ("Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next") Reported-by: Stephen Rothwell Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 410233c5681e..7a43ae6b2a44 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3264,7 +3264,8 @@ static void decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) { const struct iphdr *iph = ip_hdr(skb); - u8 *xprth = skb_network_header(skb) + iph->ihl * 4; + int ihl = iph->ihl; + u8 *xprth = skb_network_header(skb) + ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; @@ -3275,6 +3276,11 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) fl4->flowi4_mark = skb->mark; fl4->flowi4_oif = reverse ? skb->skb_iif : oif; + fl4->flowi4_proto = iph->protocol; + fl4->daddr = reverse ? iph->saddr : iph->daddr; + fl4->saddr = reverse ? iph->daddr : iph->saddr; + fl4->flowi4_tos = iph->tos; + if (!ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_UDP: @@ -3286,7 +3292,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; @@ -3298,7 +3304,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; icmp = xprth; fl4->fl4_icmp_type = icmp[0]; @@ -3310,7 +3316,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be32 *ehdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ehdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ehdr[0]; @@ -3321,7 +3327,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) pskb_may_pull(skb, xprth + 8 - skb->data)) { __be32 *ah_hdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; @@ -3332,7 +3338,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ipcomp_hdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ipcomp_hdr = (__be16 *)xprth; fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); @@ -3344,7 +3350,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) __be16 *greflags; __be32 *gre_hdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; greflags = (__be16 *)xprth; gre_hdr = (__be32 *)xprth; @@ -3360,10 +3366,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) break; } } - fl4->flowi4_proto = iph->protocol; - fl4->daddr = reverse ? iph->saddr : iph->daddr; - fl4->saddr = reverse ? iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; } #if IS_ENABLED(CONFIG_IPV6) -- cgit From d7c04b05c9ca14c55309eb139430283a45c4c25f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 May 2019 08:09:57 -0700 Subject: net: avoid weird emergency message When host is under high stress, it is very possible thread running netdev_wait_allrefs() returns from msleep(250) 10 seconds late. This leads to these messages in the syslog : [...] unregister_netdevice: waiting for syz_tun to become free. Usage count = 0 If the device refcount is zero, the wait is over. Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 108ac8137b9b..b6b8505cfb3e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8927,7 +8927,7 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; -- cgit From 510e2ceda031eed97a7a0f9aad65d271a58b460d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 16 May 2019 13:30:54 -0700 Subject: ipv6: fix src addr routing with the exception table When inserting route cache into the exception table, the key is generated with both src_addr and dest_addr with src addr routing. However, current logic always assumes the src_addr used to generate the key is a /128 host address. This is not true in the following scenarios: 1. When the route is a gateway route or does not have next hop. (rt6_is_gw_or_nonexthop() == false) 2. When calling ip6_rt_cache_alloc(), saddr is passed in as NULL. This means, when looking for a route cache in the exception table, we have to do the lookup twice: first time with the passed in /128 host address, second time with the src_addr stored in fib6_info. This solves the pmtu discovery issue reported by Mikael Magnusson where a route cache with a lower mtu info is created for a gateway route with src addr. However, the lookup code is not able to find this route cache. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Reported-by: Mikael Magnusson Bisected-by: David Ahern Signed-off-by: Wei Wang Cc: Martin Lau Cc: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 27c0cc5d9d30..7a014ca877ed 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -111,8 +111,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, - struct in6_addr *daddr, - struct in6_addr *saddr); + const struct in6_addr *daddr, + const struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_add_route_info(struct net *net, @@ -1573,31 +1573,44 @@ out: * Caller has to hold rcu_read_lock() */ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, - struct in6_addr *daddr, - struct in6_addr *saddr) + const struct in6_addr *daddr, + const struct in6_addr *saddr) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct rt6_info *ret = NULL; - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); - #ifdef CONFIG_IPV6_SUBTREES /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. - * Otherwise, the exception table is indexed by - * a hash of only fib6_dst. + * However, the src addr used to create the hash + * might not be exactly the passed in saddr which + * is a /128 addr from the flow. + * So we need to use f6i->fib6_src to redo lookup + * if the passed in saddr does not find anything. + * (See the logic in ip6_rt_cache_alloc() on how + * rt->rt6i_src is updated.) */ if (res->f6i->fib6_src.plen) src_key = saddr; +find_ex: #endif + bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) ret = rt6_ex->rt6i; +#ifdef CONFIG_IPV6_SUBTREES + /* Use fib6_src as src_key and redo lookup */ + if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) { + src_key = &res->f6i->fib6_src.addr; + goto find_ex; + } +#endif + return ret; } @@ -2672,12 +2685,10 @@ u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { - struct rt6_exception_bucket *bucket; const struct fib6_nh *nh = res->nh; struct fib6_info *f6i = res->f6i; - const struct in6_addr *src_key; - struct rt6_exception *rt6_ex; struct inet6_dev *idev; + struct rt6_info *rt; u32 mtu = 0; if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { @@ -2686,18 +2697,10 @@ u32 ip6_mtu_from_fib6(const struct fib6_result *res, goto out; } - src_key = NULL; -#ifdef CONFIG_IPV6_SUBTREES - if (f6i->fib6_src.plen) - src_key = saddr; -#endif - - bucket = rcu_dereference(f6i->rt6i_exception_bucket); - rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); - if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) - mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); - - if (likely(!mtu)) { + rt = rt6_find_cached_rt(res, daddr, saddr); + if (unlikely(rt)) { + mtu = dst_metric_raw(&rt->dst, RTAX_MTU); + } else { struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; -- cgit From 34dcf6a1902ac214149a2742250ff03aa5346f3e Mon Sep 17 00:00:00 2001 From: swkhack Date: Fri, 17 May 2019 15:59:22 +0800 Subject: net: caif: fix the value of size argument of snprintf Because the function snprintf write at most size bytes(including the null byte).So the value of the argument size need not to minus one. Signed-off-by: swkhack Signed-off-by: David S. Miller --- net/caif/cfdbgl.c | 2 +- net/caif/cfdgml.c | 3 +-- net/caif/cfutill.c | 2 +- net/caif/cfveil.c | 2 +- net/caif/cfvidl.c | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c index 7aae0b56829e..cce839bf49f7 100644 --- a/net/caif/cfdbgl.c +++ b/net/caif/cfdbgl.c @@ -26,7 +26,7 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info) cfsrvl_init(dbg, channel_id, dev_info, false); dbg->layer.receive = cfdbgl_receive; dbg->layer.transmit = cfdbgl_transmit; - snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ - 1, "dbg%d", channel_id); + snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ, "dbg%d", channel_id); return &dbg->layer; } diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c index 3bdddb32d55a..58fdb99a390f 100644 --- a/net/caif/cfdgml.c +++ b/net/caif/cfdgml.c @@ -33,8 +33,7 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info) cfsrvl_init(dgm, channel_id, dev_info, true); dgm->layer.receive = cfdgml_receive; dgm->layer.transmit = cfdgml_transmit; - snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ - 1, "dgm%d", channel_id); - dgm->layer.name[CAIF_LAYER_NAME_SZ - 1] = '\0'; + snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ, "dgm%d", channel_id); return &dgm->layer; } diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c index 1728fa4471cf..be7c43a92ead 100644 --- a/net/caif/cfutill.c +++ b/net/caif/cfutill.c @@ -33,7 +33,7 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info) cfsrvl_init(util, channel_id, dev_info, true); util->layer.receive = cfutill_receive; util->layer.transmit = cfutill_transmit; - snprintf(util->layer.name, CAIF_LAYER_NAME_SZ - 1, "util1"); + snprintf(util->layer.name, CAIF_LAYER_NAME_SZ, "util1"); return &util->layer; } diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c index 262224581efa..35dd3a600dd1 100644 --- a/net/caif/cfveil.c +++ b/net/caif/cfveil.c @@ -32,7 +32,7 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info) cfsrvl_init(vei, channel_id, dev_info, true); vei->layer.receive = cfvei_receive; vei->layer.transmit = cfvei_transmit; - snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ - 1, "vei%d", channel_id); + snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ, "vei%d", channel_id); return &vei->layer; } diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c index b3b110e8a350..73615e3b3b58 100644 --- a/net/caif/cfvidl.c +++ b/net/caif/cfvidl.c @@ -29,7 +29,7 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info) cfsrvl_init(vid, channel_id, dev_info, false); vid->layer.receive = cfvidl_receive; vid->layer.transmit = cfvidl_transmit; - snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ - 1, "vid1"); + snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ, "vid1"); return &vid->layer; } -- cgit From 532b0f7ece4cb2ffd24dc723ddf55242d1188e5e Mon Sep 17 00:00:00 2001 From: Junwei Hu Date: Fri, 17 May 2019 19:27:34 +0800 Subject: tipc: fix modprobe tipc failed after switch order of device registration Error message printed: modprobe: ERROR: could not insert 'tipc': Address family not supported by protocol. when modprobe tipc after the following patch: switch order of device registration, commit 7e27e8d6130c ("tipc: switch order of device registration to fix a crash") Because sock_create_kern(net, AF_TIPC, ...) is called by tipc_topsrv_create_listener() in the initialization process of tipc_net_ops, tipc_socket_init() must be execute before that. I move tipc_socket_init() into function tipc_init_net(). Fixes: 7e27e8d6130c ("tipc: switch order of device registration to fix a crash") Signed-off-by: Junwei Hu Reported-by: Wang Wang Reviewed-by: Kang Zhou Reviewed-by: Suanming Mou Signed-off-by: David S. Miller --- net/tipc/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index ddd2e0f67c07..7d05d6823545 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -68,6 +68,10 @@ static int __net_init tipc_init_net(struct net *net) INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); + err = tipc_socket_init(); + if (err) + goto out_socket; + err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; @@ -94,6 +98,8 @@ out_subscr: out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: + tipc_socket_stop(); +out_socket: return err; } @@ -104,6 +110,7 @@ static void __net_exit tipc_exit_net(struct net *net) tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); + tipc_socket_stop(); } static struct pernet_operations tipc_net_ops = { @@ -139,10 +146,6 @@ static int __init tipc_init(void) if (err) goto out_pernet; - err = tipc_socket_init(); - if (err) - goto out_socket; - err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -150,8 +153,6 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: - tipc_socket_stop(); -out_socket: unregister_pernet_subsys(&tipc_net_ops); out_pernet: tipc_unregister_sysctl(); @@ -167,7 +168,6 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); - tipc_socket_stop(); unregister_pernet_subsys(&tipc_net_ops); tipc_netlink_stop(); tipc_netlink_compat_stop(); -- cgit From ac03046ece2b158ebd204dfc4896fd9f39f0e6c8 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 17 May 2019 16:45:43 +0200 Subject: vsock/virtio: free packets during the socket release When the socket is released, we should free all packets queued in the per-socket list in order to avoid a memory leak. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 602715fc9a75..f3f3d06cb6d8 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -786,12 +786,19 @@ static bool virtio_transport_close(struct vsock_sock *vsk) void virtio_transport_release(struct vsock_sock *vsk) { + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt, *tmp; struct sock *sk = &vsk->sk; bool remove_sock = true; lock_sock(sk); if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); + + list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } release_sock(sk); if (remove_sock) -- cgit From 5593530e56943182ebb6d81eca8a3be6db6dbba4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 17 May 2019 12:15:05 -0700 Subject: Revert "tipc: fix modprobe tipc failed after switch order of device registration" This reverts commit 532b0f7ece4cb2ffd24dc723ddf55242d1188e5e. More revisions coming up. Signed-off-by: David S. Miller --- net/tipc/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index 7d05d6823545..ddd2e0f67c07 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -68,10 +68,6 @@ static int __net_init tipc_init_net(struct net *net) INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); - err = tipc_socket_init(); - if (err) - goto out_socket; - err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; @@ -98,8 +94,6 @@ out_subscr: out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: - tipc_socket_stop(); -out_socket: return err; } @@ -110,7 +104,6 @@ static void __net_exit tipc_exit_net(struct net *net) tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); - tipc_socket_stop(); } static struct pernet_operations tipc_net_ops = { @@ -146,6 +139,10 @@ static int __init tipc_init(void) if (err) goto out_pernet; + err = tipc_socket_init(); + if (err) + goto out_socket; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -153,6 +150,8 @@ static int __init tipc_init(void) pr_info("Started in single node mode\n"); return 0; out_bearer: + tipc_socket_stop(); +out_socket: unregister_pernet_subsys(&tipc_net_ops); out_pernet: tipc_unregister_sysctl(); @@ -168,6 +167,7 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + tipc_socket_stop(); unregister_pernet_subsys(&tipc_net_ops); tipc_netlink_stop(); tipc_netlink_compat_stop(); -- cgit From ba95e5dfd36647622d8897a2a0470dde60e59ffd Mon Sep 17 00:00:00 2001 From: "Jorge E. Moreira" Date: Thu, 16 May 2019 13:51:07 -0700 Subject: vsock/virtio: Initialize core virtio vsock before registering the driver Avoid a race in which static variables in net/vmw_vsock/af_vsock.c are accessed (while handling interrupts) before they are initialized. [ 4.201410] BUG: unable to handle kernel paging request at ffffffffffffffe8 [ 4.207829] IP: vsock_addr_equals_addr+0x3/0x20 [ 4.211379] PGD 28210067 P4D 28210067 PUD 28212067 PMD 0 [ 4.211379] Oops: 0000 [#1] PREEMPT SMP PTI [ 4.211379] Modules linked in: [ 4.211379] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 4.14.106-419297-gd7e28cc1f241 #1 [ 4.211379] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 4.211379] Workqueue: virtio_vsock virtio_transport_rx_work [ 4.211379] task: ffffa3273d175280 task.stack: ffffaea1800e8000 [ 4.211379] RIP: 0010:vsock_addr_equals_addr+0x3/0x20 [ 4.211379] RSP: 0000:ffffaea1800ebd28 EFLAGS: 00010286 [ 4.211379] RAX: 0000000000000002 RBX: 0000000000000000 RCX: ffffffffb94e42f0 [ 4.211379] RDX: 0000000000000400 RSI: ffffffffffffffe0 RDI: ffffaea1800ebdd0 [ 4.211379] RBP: ffffaea1800ebd58 R08: 0000000000000001 R09: 0000000000000001 [ 4.211379] R10: 0000000000000000 R11: ffffffffb89d5d60 R12: ffffaea1800ebdd0 [ 4.211379] R13: 00000000828cbfbf R14: 0000000000000000 R15: ffffaea1800ebdc0 [ 4.211379] FS: 0000000000000000(0000) GS:ffffa3273fd00000(0000) knlGS:0000000000000000 [ 4.211379] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4.211379] CR2: ffffffffffffffe8 CR3: 000000002820e001 CR4: 00000000001606e0 [ 4.211379] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4.211379] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4.211379] Call Trace: [ 4.211379] ? vsock_find_connected_socket+0x6c/0xe0 [ 4.211379] virtio_transport_recv_pkt+0x15f/0x740 [ 4.211379] ? detach_buf+0x1b5/0x210 [ 4.211379] virtio_transport_rx_work+0xb7/0x140 [ 4.211379] process_one_work+0x1ef/0x480 [ 4.211379] worker_thread+0x312/0x460 [ 4.211379] kthread+0x132/0x140 [ 4.211379] ? process_one_work+0x480/0x480 [ 4.211379] ? kthread_destroy_worker+0xd0/0xd0 [ 4.211379] ret_from_fork+0x35/0x40 [ 4.211379] Code: c7 47 08 00 00 00 00 66 c7 07 28 00 c7 47 08 ff ff ff ff c7 47 04 ff ff ff ff c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 8b 47 08 <3b> 46 08 75 0a 8b 47 04 3b 46 04 0f 94 c0 c3 31 c0 c3 90 66 2e [ 4.211379] RIP: vsock_addr_equals_addr+0x3/0x20 RSP: ffffaea1800ebd28 [ 4.211379] CR2: ffffffffffffffe8 [ 4.211379] ---[ end trace f31cc4a2e6df3689 ]--- [ 4.211379] Kernel panic - not syncing: Fatal exception in interrupt [ 4.211379] Kernel Offset: 0x37000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 4.211379] Rebooting in 5 seconds.. Fixes: 22b5c0b63f32 ("vsock/virtio: fix kernel panic after device hot-unplug") Cc: Stefan Hajnoczi Cc: Stefano Garzarella Cc: "David S. Miller" Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: netdev@vger.kernel.org Cc: kernel-team@android.com Cc: stable@vger.kernel.org [4.9+] Signed-off-by: Jorge E. Moreira Reviewed-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Acked-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 15eb5d3d4750..96ab344f17bb 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -702,28 +702,27 @@ static int __init virtio_vsock_init(void) if (!virtio_vsock_workqueue) return -ENOMEM; - ret = register_virtio_driver(&virtio_vsock_driver); + ret = vsock_core_init(&virtio_transport.transport); if (ret) goto out_wq; - ret = vsock_core_init(&virtio_transport.transport); + ret = register_virtio_driver(&virtio_vsock_driver); if (ret) - goto out_vdr; + goto out_vci; return 0; -out_vdr: - unregister_virtio_driver(&virtio_vsock_driver); +out_vci: + vsock_core_exit(); out_wq: destroy_workqueue(virtio_vsock_workqueue); return ret; - } static void __exit virtio_vsock_exit(void) { - vsock_core_exit(); unregister_virtio_driver(&virtio_vsock_driver); + vsock_core_exit(); destroy_workqueue(virtio_vsock_workqueue); } -- cgit From ea9a03791a73e853897eda93e139018ca38f3c94 Mon Sep 17 00:00:00 2001 From: Patrick Talbert Date: Fri, 17 May 2019 17:11:28 +0200 Subject: net: Treat sock->sk_drops as an unsigned int when printing Currently, procfs socket stats format sk_drops as a signed int (%d). For large values this will cause a negative number to be printed. We know the drop count can never be a negative so change the format specifier to %u. Signed-off-by: Patrick Talbert Signed-off-by: David S. Miller --- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/datagram.c | 2 +- net/netlink/af_netlink.c | 2 +- net/phonet/socket.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7ccb5f87f70b..834be7daeb32 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1113,7 +1113,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dc91c27bb788..0e482f07b37f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1076,7 +1076,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3c58ba02af7d..8fb250ed53d4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2883,7 +2883,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ee4a4e54d016..f07fb24f4ba1 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1034,7 +1034,7 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 216ab915dd54..718a97d5f1fd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2642,7 +2642,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v) struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8d %-8lu\n", + seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", s, s->sk_protocol, nlk->portid, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 30187990257f..2567af2fbd6f 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -607,7 +607,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) struct pn_sock *pn = pn_sk(sk); seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu " - "%d %pK %d", + "%d %pK %u", sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), -- cgit From 85806af0c6bac0feb777e255a25fd5d0cf6ad38e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 May 2019 21:23:07 -0700 Subject: net: fix kernel-doc warnings for socket.c Fix kernel-doc warnings by moving the kernel-doc notation to be immediately above the functions that it describes. Fixes these warnings for sock_sendmsg() and sock_recvmsg(): ../net/socket.c:658: warning: Excess function parameter 'sock' description in 'INDIRECT_CALLABLE_DECLARE' ../net/socket.c:658: warning: Excess function parameter 'msg' description in 'INDIRECT_CALLABLE_DECLARE' ../net/socket.c:889: warning: Excess function parameter 'sock' description in 'INDIRECT_CALLABLE_DECLARE' ../net/socket.c:889: warning: Excess function parameter 'msg' description in 'INDIRECT_CALLABLE_DECLARE' ../net/socket.c:889: warning: Excess function parameter 'flags' description in 'INDIRECT_CALLABLE_DECLARE' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/socket.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 472fbefa5d9b..72372dc5dd70 100644 --- a/net/socket.c +++ b/net/socket.c @@ -645,14 +645,6 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) } EXPORT_SYMBOL(__sock_tx_timestamp); -/** - * sock_sendmsg - send a message through @sock - * @sock: socket - * @msg: message to send - * - * Sends @msg through @sock, passing through LSM. - * Returns the number of bytes sent, or an error code. - */ INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) @@ -663,6 +655,14 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) return ret; } +/** + * sock_sendmsg - send a message through @sock + * @sock: socket + * @msg: message to send + * + * Sends @msg through @sock, passing through LSM. + * Returns the number of bytes sent, or an error code. + */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { int err = security_socket_sendmsg(sock, msg, @@ -875,15 +875,6 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, } EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); -/** - * sock_recvmsg - receive a message from @sock - * @sock: socket - * @msg: message to receive - * @flags: message flags - * - * Receives @msg from @sock, passing through LSM. Returns the total number - * of bytes received, or an error. - */ INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t , int )); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, @@ -893,6 +884,15 @@ static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, msg_data_left(msg), flags); } +/** + * sock_recvmsg - receive a message from @sock + * @sock: socket + * @msg: message to receive + * @flags: message flags + * + * Receives @msg from @sock, passing through LSM. Returns the total number + * of bytes received, or an error. + */ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); -- cgit