From 8edc3affc0770886c7bfb3436b0fdd09bce13167 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sat, 4 Mar 2017 08:57:33 -0800 Subject: rds: tcp: Take explicit refcounts on struct net It is incorrect for the rds_connection to piggyback on the sock_net() refcount for the netns because this gives rise to a chicken-and-egg problem during rds_conn_destroy. Instead explicitly take a ref on the net, and hold the netns down till the connection tear-down is complete. Reported-by: Dmitry Vyukov Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/rds/tcp.c') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index a973d3b4dff0..65c8e3b3b710 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -529,7 +529,7 @@ static void rds_tcp_kill_sock(struct net *net) flush_work(&rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); + struct net *c_net = tc->t_cpath->cp_conn->c_net; if (net != c_net || !tc->t_sock) continue; @@ -584,7 +584,7 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); + struct net *c_net = tc->t_cpath->cp_conn->c_net; if (net != c_net || !tc->t_sock) continue; -- cgit From 16c09b1c7657522a321b04aa7f4300865b7cb292 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sat, 4 Mar 2017 08:57:34 -0800 Subject: rds: tcp: Reorder initialization sequence in rds_tcp_init to avoid races Order of initialization in rds_tcp_init needs to be done so that resources are set up and destroyed in the correct synchronization sequence with both the data path, as well as netns create/destroy path. Specifically, - we must call register_pernet_subsys and get the rds_tcp_netid before calling register_netdevice_notifier, otherwise we risk the sequence 1. register_netdevice_notifier sets up netdev notifier callback 2. rds_tcp_dev_event -> rds_tcp_kill_sock uses netid 0, and finds the wrong rtn, resulting in a panic with string that is of the form: BUG: unable to handle kernel NULL pointer dereference at 000000000000000d IP: rds_tcp_kill_sock+0x3a/0x1d0 [rds_tcp] : - the rds_tcp_incoming_slab kmem_cache must be initialized before the datapath starts up. The latter can happen any time after the pernet_subsys registration of rds_tcp_net_ops, whose -> init function sets up the listen socket. If the rds_tcp_incoming_slab has not been set up at that time, a panic of the form below may be encountered BUG: unable to handle kernel NULL pointer dereference at 0000000000000014 IP: kmem_cache_alloc+0x90/0x1c0 : rds_tcp_data_recv+0x1e7/0x370 [rds_tcp] tcp_read_sock+0x96/0x1c0 rds_tcp_recv_path+0x65/0x80 [rds_tcp] : Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net/rds/tcp.c') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 65c8e3b3b710..fbf807a0cc4a 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -638,19 +638,19 @@ static int rds_tcp_init(void) goto out; } - ret = register_netdevice_notifier(&rds_tcp_dev_notifier); - if (ret) { - pr_warn("could not register rds_tcp_dev_notifier\n"); + ret = rds_tcp_recv_init(); + if (ret) goto out_slab; - } ret = register_pernet_subsys(&rds_tcp_net_ops); if (ret) - goto out_notifier; + goto out_recv; - ret = rds_tcp_recv_init(); - if (ret) + ret = register_netdevice_notifier(&rds_tcp_dev_notifier); + if (ret) { + pr_warn("could not register rds_tcp_dev_notifier\n"); goto out_pernet; + } rds_trans_register(&rds_tcp_transport); @@ -660,9 +660,8 @@ static int rds_tcp_init(void) out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); -out_notifier: - if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) - pr_warn("could not unregister rds_tcp_dev_notifier\n"); +out_recv: + rds_tcp_recv_exit(); out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: -- cgit From b21dd4506b71bdb9c5a20e759255cd2513ea7ebe Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sat, 4 Mar 2017 08:57:35 -0800 Subject: rds: tcp: Sequence teardown of listen and acceptor sockets to avoid races Commit a93d01f5777e ("RDS: TCP: avoid bad page reference in rds_tcp_listen_data_ready") added the function rds_tcp_listen_sock_def_readable() to handle the case when a partially set-up acceptor socket drops into rds_tcp_listen_data_ready(). However, if the listen socket (rtn->rds_tcp_listen_sock) is itself going through a tear-down via rds_tcp_listen_stop(), the (*ready)() will be null and we would hit a panic of the form BUG: unable to handle kernel NULL pointer dereference at (null) IP: (null) : ? rds_tcp_listen_data_ready+0x59/0xb0 [rds_tcp] tcp_data_queue+0x39d/0x5b0 tcp_rcv_established+0x2e5/0x660 tcp_v4_do_rcv+0x122/0x220 tcp_v4_rcv+0x8b7/0x980 : In the above case, it is not fatal to encounter a NULL value for ready- we should just drop the packet and let the flush of the acceptor thread finish gracefully. In general, the tear-down sequence for listen() and accept() socket that is ensured by this commit is: rtn->rds_tcp_listen_sock = NULL; /* prevent any new accepts */ In rds_tcp_listen_stop(): serialize with, and prevent, further callbacks using lock_sock() flush rds_wq flush acceptor workq sock_release(listen socket) Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/rds/tcp.c') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index fbf807a0cc4a..225690076773 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -484,9 +484,10 @@ static void __net_exit rds_tcp_exit_net(struct net *net) * we do need to clean up the listen socket here. */ if (rtn->rds_tcp_listen_sock) { - rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + struct socket *lsock = rtn->rds_tcp_listen_sock; + rtn->rds_tcp_listen_sock = NULL; - flush_work(&rtn->rds_tcp_accept_w); + rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); } } @@ -523,10 +524,10 @@ static void rds_tcp_kill_sock(struct net *net) struct rds_tcp_connection *tc, *_tc; LIST_HEAD(tmp_list); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; - rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); rtn->rds_tcp_listen_sock = NULL; - flush_work(&rtn->rds_tcp_accept_w); + rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = tc->t_cpath->cp_conn->c_net; @@ -546,8 +547,12 @@ static void rds_tcp_kill_sock(struct net *net) void *rds_tcp_listen_sock_def_readable(struct net *net) { struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; + + if (!lsock) + return NULL; - return rtn->rds_tcp_listen_sock->sk->sk_user_data; + return lsock->sk->sk_user_data; } static int rds_tcp_dev_event(struct notifier_block *this, -- cgit