diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-08-30 16:06:38 -0700 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-08-30 16:06:38 -0700 |
commit | 1ac731c529cd4d6adbce134754b51ff7d822b145 (patch) | |
tree | 143ab3f35ca5f3b69f583c84e6964b17139c2ec1 /net/mptcp | |
parent | 07b4c950f27bef0362dc6ad7ee713aab61d58149 (diff) | |
parent | 54116d442e001e1b6bd482122043b1870998a1f3 (diff) |
Merge branch 'next' into for-linus
Prepare input updates for 6.6 merge window.
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/fastopen.c | 11 | ||||
-rw-r--r-- | net/mptcp/options.c | 14 | ||||
-rw-r--r-- | net/mptcp/pm.c | 27 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 41 | ||||
-rw-r--r-- | net/mptcp/pm_userspace.c | 52 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 487 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 42 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 46 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 188 |
9 files changed, 521 insertions, 387 deletions
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index d237d142171c..bceaab8dd8e4 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -9,11 +9,18 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req) { - struct sock *ssk = subflow->tcp_sock; - struct sock *sk = subflow->conn; + struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + /* on early fallback the subflow context is deleted by + * subflow_syn_recv_sock() + */ + if (!subflow) + return; + + ssk = subflow->tcp_sock; + sk = subflow->conn; tp = tcp_sk(ssk); subflow->is_mptfo = 1; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b30cea2fbf3f..19a01b6566f1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -442,7 +442,6 @@ static void clear_3rdack_retransmission(struct sock *sk) static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, - unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -556,7 +555,6 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, - unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -580,7 +578,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy = *mpext; } - remaining -= map_size; dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); @@ -851,9 +848,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, } snd_data_fin = mptcp_data_fin_enabled(msk); - if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) + if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) { + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; @@ -1001,7 +998,7 @@ check_notify: clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { - mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); + mptcp_pm_fully_established(msk, ssk); } return true; @@ -1192,9 +1189,8 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && - mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) && - schedule_work(&msk->work)) - sock_hold(subflow->conn); + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) + mptcp_schedule_work((struct sock *)msk); return true; } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 70f0ced3ca86..76612bca275a 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -87,8 +87,15 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) unsigned int subflows_max; int ret = 0; - if (mptcp_pm_is_userspace(msk)) - return mptcp_userspace_pm_active(msk); + if (mptcp_pm_is_userspace(msk)) { + if (mptcp_userspace_pm_active(msk)) { + spin_lock_bh(&pm->lock); + pm->subflows++; + spin_unlock_bh(&pm->lock); + return true; + } + return false; + } subflows_max = mptcp_pm_get_subflows_max(msk); @@ -126,7 +133,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return true; } -void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) { struct mptcp_pm_data *pm = &msk->pm; bool announce = false; @@ -150,7 +157,7 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, spin_unlock_bh(&pm->lock); if (announce) - mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp); + mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) @@ -181,8 +188,16 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; - update_subflows = (subflow->request_join || subflow->mp_join) && - mptcp_pm_is_kernel(msk); + update_subflows = subflow->request_join || subflow->mp_join; + if (mptcp_pm_is_userspace(msk)) { + if (update_subflows) { + spin_lock_bh(&pm->lock); + pm->subflows--; + spin_unlock_bh(&pm->lock); + } + return; + } + if (!READ_ONCE(pm->work_pending) && !update_subflows) return; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 56628b52d100..1224dfca5bf3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -997,9 +997,13 @@ out: return ret; } +static struct lock_class_key mptcp_slock_keys[2]; +static struct lock_class_key mptcp_keys[2]; + static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { + bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct socket *ssock; @@ -1016,11 +1020,23 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (!newsk) return -EINVAL; + /* The subflow socket lock is acquired in a nested to the msk one + * in several places, even by the TCP stack, and this msk is a kernel + * socket: lockdep complains. Instead of propagating the _nested + * modifiers in several places, re-init the lock class for the msk + * socket to an mptcp specific one. + */ + sock_lock_init_class_and_name(newsk, + is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", + &mptcp_slock_keys[is_ipv6], + is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", + &mptcp_keys[is_ipv6]); + lock_sock(newsk); ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); release_sock(newsk); - if (!ssock) - return -EINVAL; + if (IS_ERR(ssock)) + return PTR_ERR(ssock); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1031,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; + inet_sk_state_store(newsk, TCP_LISTEN); err = kernel_listen(ssock, backlog); if (err) return err; @@ -1542,6 +1559,24 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } +void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, rm_list, list) { + remove_anno_list_by_saddr(msk, &entry->addr); + if (alist.nr < MPTCP_RM_IDS_MAX) + alist.ids[alist.nr++] = entry->addr.id; + } + + if (alist.nr) { + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); + } +} + void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { @@ -2019,7 +2054,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; - sk_err = ssk->sk_err; + sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a02d3cbf2a1b..b06aa58dfcf2 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -25,8 +25,8 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk) } } -int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry) +static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_pm_addr_entry *match = NULL; @@ -69,6 +69,7 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, MPTCP_PM_MAX_ADDR_ID + 1, 1); list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); + msk->pm.local_addr_used++; ret = e->addr.id; } else if (match) { ret = entry->addr.id; @@ -79,6 +80,31 @@ append_err: return ret; } +/* If the subflow is closed from the other peer (not via a + * subflow destroy command then), we want to keep the entry + * not to assign the same ID to another address and to be + * able to send RM_ADDR after the removal of the subflow. + */ +static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *addr) +{ + struct mptcp_pm_addr_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { + /* TODO: a refcount is needed because the entry can + * be used multiple times (e.g. fullmesh mode). + */ + list_del_rcu(&entry->list); + kfree(entry); + msk->pm.local_addr_used--; + return 0; + } + } + + return -EINVAL; +} + int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) @@ -171,6 +197,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&msk->pm.lock); if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); mptcp_pm_nl_addr_send_ack(msk); } @@ -232,7 +259,7 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) list_move(&match->list, &free_list); - mptcp_pm_remove_addrs_and_subflows(msk, &free_list); + mptcp_pm_remove_addrs(msk, &free_list); release_sock((struct sock *)msk); @@ -251,6 +278,7 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_pm_addr_entry local = { 0 }; struct mptcp_addr_info addr_r; struct mptcp_addr_info addr_l; struct mptcp_sock *msk; @@ -302,12 +330,26 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) goto create_err; } + local.addr = addr_l; + err = mptcp_userspace_pm_append_new_local_addr(msk, &local); + if (err < 0) { + GENL_SET_ERR_MSG(info, "did not match address and id"); + goto create_err; + } + lock_sock(sk); err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); release_sock(sk); + spin_lock_bh(&msk->pm.lock); + if (err) + mptcp_userspace_pm_delete_local_addr(msk, &local); + else + msk->pm.subflows++; + spin_unlock_bh(&msk->pm.lock); + create_err: sock_put((struct sock *)msk); return err; @@ -420,7 +462,11 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); if (ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_pm_addr_entry entry = { .addr = addr_l }; + spin_lock_bh(&msk->pm.lock); + mptcp_userspace_pm_delete_local_addr(msk, &entry); + spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); mptcp_close_ssk(sk, ssk, subflow); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3ad9c46202fc..a6c7f2d24909 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -44,23 +44,11 @@ enum { static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); -static void __mptcp_check_send_data_fin(struct sock *sk); +static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; -/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not - * completed yet or has failed, return the subflow socket. - * Otherwise return NULL. - */ -struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) -{ - if (!msk->subflow || READ_ONCE(msk->can_ack)) - return NULL; - - return msk->subflow; -} - /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) { @@ -102,8 +90,8 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; - msk->first = ssock->sk; - msk->subflow = ssock; + WRITE_ONCE(msk->first, ssock->sk); + WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); @@ -116,6 +104,31 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) return 0; } +/* If the MPC handshake is not started, returns the first subflow, + * eventually allocating it. + */ +struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + int ret; + + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + return ERR_PTR(-EINVAL); + + if (!msk->subflow) { + if (msk->first) + return ERR_PTR(-EINVAL); + + ret = __mptcp_socket_create(msk); + if (ret) + return ERR_PTR(ret); + + mptcp_sockopt_sync(msk, msk->first); + } + + return msk->subflow; +} + static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); @@ -411,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - return !__mptcp_check_fallback(msk) && - ((1 << sk->sk_state) & + return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } @@ -459,7 +471,7 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) return false; } -static void mptcp_set_datafin_timeout(const struct sock *sk) +static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; @@ -570,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk) u64 rcv_data_fin_seq; bool ret = false; - if (__mptcp_check_fallback(msk)) - return ret; - /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options @@ -590,7 +599,7 @@ static bool mptcp_check_data_fin(struct sock *sk) WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { @@ -610,7 +619,8 @@ static bool mptcp_check_data_fin(struct sock *sk) } ret = true; - mptcp_send_ack(msk); + if (!__mptcp_check_fallback(msk)) + mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; @@ -812,6 +822,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) mptcp_data_unlock(sk); } +static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) +{ + mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); +} + static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; @@ -825,17 +842,17 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); - mptcp_propagate_sndbuf((struct sock *)msk, ssk); mptcp_sockopt_sync_locked(msk, ssk); + mptcp_subflow_joined(msk, ssk); return true; } -static void __mptcp_flush_join_list(struct sock *sk) +static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { + list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); @@ -877,49 +894,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -void mptcp_subflow_eof(struct sock *sk) -{ - if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) - mptcp_schedule_work(sk); -} - -static void mptcp_check_for_eof(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int receivers = 0; - - mptcp_for_each_subflow(msk, subflow) - receivers += !subflow->rx_eof; - if (receivers) - return; - - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { - /* hopefully temporary hack: propagate shutdown status - * to msk, when all subflows agree on it - */ - sk->sk_shutdown |= RCV_SHUTDOWN; - - smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - sk->sk_data_ready(sk); - } - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - inet_sk_state_store(sk, TCP_CLOSE_WAIT); - break; - case TCP_FIN_WAIT1: - inet_sk_state_store(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - inet_sk_state_store(sk, TCP_CLOSE); - break; - default: - return; - } - mptcp_close_wake_up(sk); -} - static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -1589,7 +1563,7 @@ out: if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); if (do_check_data_fin) - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) @@ -1663,16 +1637,33 @@ static void mptcp_set_nospace(struct sock *sk) static int mptcp_disconnect(struct sock *sk, int flags); -static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, +static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + struct sock *ssk; int ret; + /* on flags based fastopen the mptcp is supposed to create the + * first subflow right now. Otherwise we are in the defer_connect + * path, and the first subflow must be already present. + * Since the defer_connect flag is cleared after the first succsful + * fastopen attempt, no need to check for additional subflow status. + */ + if (msg->msg_flags & MSG_FASTOPEN) { + ssock = __mptcp_nmpc_socket(msk); + if (IS_ERR(ssock)) + return PTR_ERR(ssock); + } + if (!msk->first) + return -EINVAL; + + ssk = msk->first; + lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; - msk->connect_flags = O_NONBLOCK; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; @@ -1690,8 +1681,15 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { - mptcp_disconnect(sk, 0); + /* The disconnect() op called by tcp_sendmsg_fastopen()/ + * __inet_stream_connect() can fail, due to looking check, + * see mptcp_disconnect(). + * Attempt it again outside the problematic scope. + */ + if (!mptcp_disconnect(sk, 0)) + sk->sk_socket->state = SS_UNCONNECTED; } + inet_sk(sk)->defer_connect = 0; return ret; } @@ -1700,7 +1698,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; - struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1710,12 +1707,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect || - msg->msg_flags & MSG_FASTOPEN))) { + if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; - ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); + ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; @@ -2123,9 +2118,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check @@ -2255,7 +2247,7 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) { if (msk->subflow) { iput(SOCK_INODE(msk->subflow)); - msk->subflow = NULL; + WRITE_ONCE(msk->subflow, NULL); } } @@ -2316,7 +2308,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push, dispose_it; + bool dispose_it, need_push = false; + + /* If the first subflow moved to a close state before accept, e.g. due + * to an incoming reset, mptcp either: + * - if either the subflow or the msk are dead, destroy the context + * (the subflow socket is deleted by inet_child_forget) and the msk + * - otherwise do nothing at the moment and take action at accept and/or + * listener shutdown - user-space must be able to accept() the closed + * socket. + */ + if (msk->in_accept_queue && msk->first == ssk) { + if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) + return; + + /* ensure later check in mptcp_worker() will dispose the msk */ + sock_set_flag(sk, SOCK_DEAD); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_drop_ctx(ssk); + goto out_release; + } dispose_it = !msk->subflow || ssk != msk->subflow->sk; if (dispose_it) @@ -2335,7 +2346,10 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { - tcp_disconnect(ssk, 0); + /* The MPTCP code never wait on the subflow sockets, TCP-level + * disconnect should never fail + */ + WARN_ON_ONCE(tcp_disconnect(ssk, 0)); msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -2343,7 +2357,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out; } - sock_orphan(ssk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2351,26 +2364,23 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { + WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) { - tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(sk, ssk); - inet_csk_listen_stop(ssk); - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); - } __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } + +out_release: release_sock(ssk); sock_put(ssk); if (ssk == msk->first) - msk->first = NULL; + WRITE_ONCE(msk->first, NULL); out: if (ssk == msk->last_snd) @@ -2399,9 +2409,10 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } -static void __mptcp_close_subflow(struct mptcp_sock *msk) +static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); @@ -2415,16 +2426,17 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; - mptcp_close_ssk((struct sock *)msk, ssk, subflow); + mptcp_close_ssk(sk, ssk, subflow); } + } -static bool mptcp_check_close_timeout(const struct sock *sk) +static bool mptcp_should_close(const struct sock *sk) { s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; struct mptcp_subflow_context *subflow; - if (delta >= TCP_TIMEWAIT_LEN) + if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) return true; /* if all subflows are in closed status don't bother with additional @@ -2463,19 +2475,19 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: - sk->sk_err = ECONNREFUSED; + WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: - sk->sk_err = EPIPE; + WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: - sk->sk_err = ECONNRESET; + WRITE_ONCE(sk->sk_err, ECONNRESET); } inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); @@ -2609,27 +2621,26 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); state = sk->sk_state; - if (unlikely(state == TCP_CLOSE)) + if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; - mptcp_check_data_fin_ack(sk); - mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); + mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(sk); + /* There is no point in keeping around an orphaned sk timedout or * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_check_close_timeout(sk)) { + if (mptcp_should_close(sk)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); } @@ -2639,9 +2650,6 @@ static void mptcp_worker(struct work_struct *work) } } - if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - __mptcp_close_subflow(msk); - if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); @@ -2669,7 +2677,7 @@ static int __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; - msk->first = NULL; + WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); @@ -2711,10 +2719,6 @@ static int mptcp_init_sock(struct sock *sk) if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; - ret = __mptcp_socket_create(mptcp_sk(sk)); - if (ret) - return ret; - set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will @@ -2757,13 +2761,19 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) break; fallthrough; case TCP_SYN_SENT: - tcp_disconnect(ssk, O_NONBLOCK); + WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); + + /* simulate the data_fin ack reception to let the state + * machine move forward + */ + WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); + mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); @@ -2803,7 +2813,7 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void __mptcp_check_send_data_fin(struct sock *sk) +static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); @@ -2821,19 +2831,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) WRITE_ONCE(msk->snd_nxt, msk->write_seq); - /* fallback socket will not get data_fin/ack, can move to the next - * state now - */ - if (__mptcp_check_fallback(msk)) { - WRITE_ONCE(msk->snd_una, msk->write_seq); - if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { - inet_sk_state_store(sk, TCP_CLOSE); - mptcp_close_wake_up(sk); - } else if (sk->sk_state == TCP_FIN_WAIT1) { - inet_sk_state_store(sk, TCP_FIN_WAIT2); - } - } - mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2853,7 +2850,7 @@ static void __mptcp_wr_shutdown(struct sock *sk) WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) @@ -2878,6 +2875,14 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +void __mptcp_unaccepted_force_close(struct sock *sk) +{ + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + __mptcp_destroy_sock(sk); +} + static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { /* Concurrent splices from sk_receive_queue into receive_queue will @@ -2890,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } -static void mptcp_listen_inuse_dec(struct sock *sk) +static void mptcp_check_listen_stop(struct sock *sk) { - if (inet_sk_state_load(sk) == TCP_LISTEN) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + struct sock *ssk; + + if (inet_sk_state_load(sk) != TCP_LISTEN) + return; + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + ssk = mptcp_sk(sk)->first; + if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) + return; + + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + tcp_set_state(ssk, TCP_CLOSE); + release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) @@ -2903,18 +2922,21 @@ bool __mptcp_close(struct sock *sk, long timeout) bool do_cancel_work = false; int subflows_alive = 0; - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } - if (mptcp_check_readable(msk)) { - /* the msk has read data, do the MPTCP equivalent of TCP reset */ + if (mptcp_check_readable(msk) || timeout < 0) { + /* If the msk has read data, or the caller explicitly ask it, + * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose + */ inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); + timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } @@ -2980,7 +3002,7 @@ static void mptcp_close(struct sock *sk, long timeout) sock_put(sk); } -void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -3007,15 +3029,20 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under - * msk->firstsocket lock). Do nothing and leave the cleanup to the - * caller. + * msk->firstsocket lock). */ if (msk->fastopening) - return 0; + return -EBUSY; - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3043,7 +3070,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } @@ -3057,9 +3084,10 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) } #endif -struct sock *mptcp_sk_clone(const struct sock *sk, - const struct mptcp_options_received *mp_opt, - struct request_sock *req) +struct sock *mptcp_sk_clone_init(const struct sock *sk, + const struct mptcp_options_received *mp_opt, + struct sock *ssk, + struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); @@ -3073,12 +3101,14 @@ struct sock *mptcp_sk_clone(const struct sock *sk, inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif + nsk->sk_wait_pending = 0; __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; - msk->subflow = NULL; + WRITE_ONCE(msk->subflow, NULL); + msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); @@ -3090,14 +3120,33 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; sock_reset_flag(nsk, SOCK_RCU_FREE); - /* will be fully established after successful MPC subflow creation */ - inet_sk_state_store(nsk, TCP_SYN_RECV); - security_inet_csk_clone(nsk, req); + + /* this can't race with mptcp_close(), as the msk is + * not yet exposted to user-space + */ + inet_sk_state_store(nsk, TCP_ESTABLISHED); + + /* The msk maintain a ref to each subflow in the connections list */ + WRITE_ONCE(msk->first, ssk); + list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list); + sock_hold(ssk); + + /* new mpc subflow takes ownership of the newly + * created mptcp socket + */ + mptcp_token_accept(subflow_req, msk); + + /* set msk addresses early to ensure mptcp_pm_get_local_id() + * uses the correct data + */ + mptcp_copy_inaddrs(nsk, ssk); + mptcp_propagate_sndbuf(nsk, ssk); + + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(nsk); - /* keep a single reference */ - __sock_put(nsk); + /* note: the newly allocated socket refcount is 2 now */ return nsk; } @@ -3126,7 +3175,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, struct socket *listener; struct sock *newsk; - listener = __mptcp_nmpc_socket(msk); + listener = READ_ONCE(msk->subflow); if (WARN_ON_ONCE(!listener)) { *err = -EINVAL; return NULL; @@ -3153,8 +3202,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, goto out; } - /* acquire the 2nd reference for the owning socket */ - sock_hold(new_mptcp_sock); newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { @@ -3242,9 +3289,14 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | msk->push_pending; + struct list_head join_list; + if (!flags) break; + INIT_LIST_HEAD(&join_list); + list_splice_init(&msk->join_list, &join_list); + /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope @@ -3255,8 +3307,9 @@ static void mptcp_release_cb(struct sock *sk) msk->push_pending = 0; msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); + if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) - __mptcp_flush_join_list(sk); + __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) @@ -3348,7 +3401,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum) struct mptcp_sock *msk = mptcp_sk(sk); struct socket *ssock; - ssock = __mptcp_nmpc_socket(msk); + ssock = msk->subflow; pr_debug("msk=%p, subflow=%p", msk, ssock); if (WARN_ON_ONCE(!ssock)) return -EINVAL; @@ -3408,14 +3461,16 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - if (!list_empty(&subflow->node)) - goto out; + /* active subflow, already present inside the conn_list */ + if (!list_empty(&subflow->node)) { + mptcp_subflow_joined(msk, ssk); + return true; + } if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; - /* active connections are already on conn_list. - * If we can't acquire msk socket lock here, let the release callback + /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); @@ -3438,11 +3493,6 @@ err_prohibited: return false; } - subflow->map_seq = READ_ONCE(msk->ack_seq); - WRITE_ONCE(msk->allow_infinite_fallback, false); - -out: - mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; } @@ -3536,8 +3586,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) int err = -EINVAL; ssock = __mptcp_nmpc_socket(msk); - if (!ssock) - return -EINVAL; + if (IS_ERR(ssock)) + return PTR_ERR(ssock); mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_SYN_SENT); @@ -3560,9 +3610,9 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * acquired the subflow socket lock, too. */ if (msk->fastopening) - err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1); else - err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK); inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; /* on successful connect, the msk state will be moved to established by @@ -3575,12 +3625,10 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) mptcp_copy_inaddrs(sk, ssock->sk); - /* unblocking connect, mptcp-level inet_stream_connect will error out - * without changing the socket state, update it here. + /* silence EINPROGRESS and let the caller inet_stream_connect + * handle the connection in progress */ - if (err == -EINPROGRESS) - sk->sk_socket->state = ssock->state; - return err; + return 0; } static struct proto mptcp_prot = { @@ -3625,8 +3673,8 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) lock_sock(sock->sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); goto unlock; } @@ -3639,18 +3687,6 @@ unlock: return err; } -static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - int ret; - - lock_sock(sock->sk); - mptcp_sk(sock->sk)->connect_flags = flags; - ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); - release_sock(sock->sk); - return ret; -} - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -3662,8 +3698,8 @@ static int mptcp_listen(struct socket *sock, int backlog) lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); goto unlock; } @@ -3694,7 +3730,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, pr_debug("msk=%p", msk); - ssock = __mptcp_nmpc_socket(msk); + /* Buggy applications can call accept on socket states other then LISTEN + * but no need to allocate the first subflow just to error out. + */ + ssock = READ_ONCE(msk->subflow); if (!ssock) return -EINVAL; @@ -3705,25 +3744,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct sock *newsk = newsock->sk; set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + msk->in_accept_queue = 0; lock_sock(newsk); - /* PM/worker can now acquire the first subflow socket - * lock without racing with listener queue cleanup, - * we can notify it, if needed. - * - * Even if remote has reset the initial subflow by now - * the refcnt is still at least one. - */ - subflow = mptcp_subflow_ctx(msk->first); - list_add(&subflow->node, &msk->conn_list); - sock_hold(msk->first); - if (mptcp_is_fully_established(newsk)) - mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); - - mptcp_rcv_space_init(msk, msk->first); - mptcp_propagate_sndbuf(newsk, msk->first); - /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ @@ -3733,6 +3757,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + + /* Do late cleanup for the first subflow as necessary. Also + * deal with bad peers not doing a complete shutdown. + */ + if (msk->first && + unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + __mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first), 0); + if (unlikely(list_empty(&msk->conn_list))) + inet_sk_state_store(newsk, TCP_CLOSE); + } + release_sock(newsk); } @@ -3743,9 +3779,6 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; - if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) - return EPOLLOUT | EPOLLWRNORM; - if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; @@ -3763,6 +3796,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; + u8 shutdown; int state; msk = mptcp_sk(sk); @@ -3771,27 +3805,34 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) { - if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) + struct socket *ssock = READ_ONCE(msk->subflow); + + if (WARN_ON_ONCE(!ssock || !ssock->sk)) return 0; - return inet_csk_listen_poll(msk->subflow->sk); + return inet_csk_listen_poll(ssock->sk); } + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; + if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - mask |= mptcp_check_writeable(msk); + if (shutdown & SEND_SHUTDOWN) + mask |= EPOLLOUT | EPOLLWRNORM; + else + mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) - mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; @@ -3802,7 +3843,7 @@ static const struct proto_ops mptcp_stream_ops = { .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, - .connect = mptcp_stream_connect, + .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, @@ -3897,7 +3938,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, - .connect = mptcp_stream_connect, + .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 61fd8eabfca2..d3783a7056e1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -113,7 +113,6 @@ /* MPTCP socket atomic flags */ #define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 -#define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 @@ -295,8 +294,8 @@ struct mptcp_sock { u8 recvmsg_inq:1, cork:1, nodelay:1, - fastopening:1; - int connect_flags; + fastopening:1, + in_accept_queue:1; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -305,7 +304,11 @@ struct mptcp_sock { struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; - struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct socket *subflow; /* outgoing connect/listener/!mp_capable + * The mptcp ops can safely dereference, using suitable + * ONCE annotation, the subflow outside the socket + * lock as such sock is freed after close(). + */ struct sock *first; struct mptcp_pm_data pm; struct { @@ -333,10 +336,7 @@ static inline void msk_owned_by_me(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); } -static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) -{ - return (struct mptcp_sock *)sk; -} +#define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) /* the msk socket don't use the backlog, also account for the bulk * free memory @@ -370,7 +370,7 @@ static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); + const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; @@ -381,7 +381,7 @@ static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } -static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) +static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -475,14 +475,13 @@ struct mptcp_subflow_context { send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, - rx_eof : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 8; + __unused : 9; enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -615,7 +614,6 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); -void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -630,9 +628,10 @@ void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); -struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); +struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, @@ -666,6 +665,8 @@ void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); +void mptcp_subflow_drop_ctx(struct sock *ssk); + static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { @@ -682,9 +683,10 @@ void __init mptcp_proto_init(void); int __init mptcp_proto_v6_init(void); #endif -struct sock *mptcp_sk_clone(const struct sock *sk, - const struct mptcp_options_received *mp_opt, - struct request_sock *req); +struct sock *mptcp_sk_clone_init(const struct sock *sk, + const struct mptcp_options_received *mp_opt, + struct sock *ssk, + struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); @@ -716,7 +718,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); -void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { @@ -783,7 +784,7 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); -void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); @@ -828,11 +829,10 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list); -int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry); void mptcp_free_local_addr_list(struct mptcp_sock *msk); int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 8a9656248b0f..d4258869ac48 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -301,9 +301,9 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BINDTOIFINDEX: lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { + if (IS_ERR(ssock)) { release_sock(sk); - return -EINVAL; + return PTR_ERR(ssock); } ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); @@ -396,9 +396,9 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_FREEBIND: lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { + if (IS_ERR(ssock)) { release_sock(sk); - return -EINVAL; + return PTR_ERR(ssock); } ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); @@ -693,9 +693,9 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { + if (IS_ERR(ssock)) { release_sock(sk); - return -EINVAL; + return PTR_ERR(ssock); } issk = inet_sk(ssock->sk); @@ -762,13 +762,15 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int { struct sock *sk = (struct sock *)msk; struct socket *sock; - int ret = -EINVAL; + int ret; /* Limit to first subflow, before the connection establishment */ lock_sock(sk); sock = __mptcp_nmpc_socket(msk); - if (!sock) + if (IS_ERR(sock)) { + ret = PTR_ERR(sock); goto unlock; + } ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); @@ -861,7 +863,7 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int { struct sock *sk = (struct sock *)msk; struct socket *ssock; - int ret = -EINVAL; + int ret; struct sock *ssk; lock_sock(sk); @@ -872,8 +874,10 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int } ssock = __mptcp_nmpc_socket(msk); - if (!ssock) + if (IS_ERR(ssock)) { + ret = PTR_ERR(ssock); goto out; + } ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); @@ -885,7 +889,6 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { u32 flags = 0; - u8 val; memset(info, 0, sizeof(*info)); @@ -893,12 +896,19 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); - info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); - val = mptcp_pm_get_add_addr_signal_max(msk); - info->mptcpi_add_addr_signal_max = val; - val = mptcp_pm_get_add_addr_accept_max(msk); - info->mptcpi_add_addr_accepted_max = val; - info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); + + /* The following limits only make sense for the in-kernel PM */ + if (mptcp_pm_is_kernel(msk)) { + info->mptcpi_subflows_max = + mptcp_pm_get_subflows_max(msk); + info->mptcpi_add_addr_signal_max = + mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_add_addr_accepted_max = + mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_local_addr_max = + mptcp_pm_get_local_addr_max(msk); + } + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) @@ -1046,7 +1056,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); memset(a, 0, sizeof(*a)); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4ae1a7304cf0..d9c8b21c6076 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -397,15 +397,19 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* mptcp_mp_fail_no_response() can reach here on an already closed + * socket + */ + if (ssk->sk_state == TCP_CLOSE) + return; + /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); - tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); - if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && - schedule_work(&mptcp_sk(sk)->work)) - return; /* worker will put sk for us */ + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); sock_put(sk); } @@ -622,7 +626,7 @@ static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; -static struct proto tcpv6_prot_override; +static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { @@ -691,13 +695,6 @@ static bool subflow_hmac_valid(const struct request_sock *req, return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } -static void mptcp_force_close(struct sock *sk) -{ - /* the msk is not yet exposed to user-space */ - inet_sk_state_store(sk, TCP_CLOSE); - sk_common_release(sk); -} - static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { @@ -711,16 +708,19 @@ static void subflow_ulp_fallback(struct sock *sk, mptcp_subflow_ops_undo_override(sk); } -static void subflow_drop_ctx(struct sock *ssk) +void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; - subflow_ulp_fallback(ssk, ctx); - if (ctx->conn) - sock_put(ctx->conn); + list_del(&mptcp_subflow_ctx(ssk)->node); + if (inet_csk(ssk)->icsk_ulp_ops) { + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + } kfree_rcu(ctx, rcu); } @@ -749,7 +749,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; - struct sock *new_msk = NULL; + struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); @@ -777,14 +777,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) fallback = true; - goto create_child; - } - new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req); - if (!new_msk) - fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || @@ -813,49 +808,29 @@ create_child: subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } - - if (new_msk) - mptcp_copy_inaddrs(new_msk, child); - subflow_drop_ctx(child); - goto out; + goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { - /* this can't race with mptcp_close(), as the msk is - * not yet exposted to user-space - */ - inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); - - /* record the newly created socket as the first msk - * subflow, but don't link it yet into conn_list - */ - WRITE_ONCE(mptcp_sk(new_msk)->first, child); + ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); + if (!ctx->conn) + goto fallback; - /* new mpc subflow takes ownership of the newly - * created mptcp socket - */ - mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; - mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); - mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); - ctx->conn = new_msk; - new_msk = NULL; - - /* set msk addresses early to ensure mptcp_pm_get_local_id() - * uses the correct data - */ - mptcp_copy_inaddrs(ctx->conn, child); + owner = mptcp_sk(ctx->conn); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress * mpc option */ - if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) + if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_subflow_fully_established(ctx, &mp_opt); + mptcp_pm_fully_established(owner, child); + ctx->pm_notified = 1; + } } else if (ctx->mp_join) { - struct mptcp_sock *owner; - owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); @@ -885,11 +860,6 @@ create_child: } } -out: - /* dispose of the left over mptcp master, if any */ - if (unlikely(new_msk)) - mptcp_force_close(new_msk); - /* check for expected invariant - should never trigger, just help * catching eariler subtle bugs */ @@ -899,7 +869,7 @@ out: return child; dispose_child: - subflow_drop_ctx(child); + mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); @@ -907,10 +877,14 @@ dispose_child: /* The last child reference will be released by the caller */ return child; + +fallback: + mptcp_subflow_drop_ctx(child); + return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; -static struct proto tcp_prot_override; +static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, @@ -1103,8 +1077,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk, skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } else { - if (updated && schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + if (updated) + mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } @@ -1207,17 +1181,12 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { - struct sock *sk = (struct sock *)msk; - if (likely(ssk->sk_state != TCP_CLOSE)) return; if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { - sock_hold(sk); - if (!schedule_work(&msk->work)) - sock_put(sk); - } + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) @@ -1335,7 +1304,7 @@ fallback: subflow->reset_reason = MPTCP_RST_EMPTCP; reset: - ssk->sk_err = EBADMSG; + WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); @@ -1419,7 +1388,7 @@ void __mptcp_error_report(struct sock *sk) ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) inet_sk_state_store(sk, ssk_state); - sk->sk_err = -err; + WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); @@ -1432,6 +1401,13 @@ static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + /* bail early if this is a no-op, so that we avoid introducing a + * problematic lockdep dependency between TCP accept queue lock + * and msk socket spinlock + */ + if (!sk->sk_socket) + return; + mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); @@ -1773,14 +1749,16 @@ static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; __subflow_state_change(sk); + msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); - mptcp_rcv_space_init(mptcp_sk(parent), sk); - pr_fallback(mptcp_sk(parent)); + mptcp_rcv_space_init(msk, sk); + pr_fallback(msk); subflow->conn_finished = 1; mptcp_set_connected(parent); } @@ -1796,11 +1774,12 @@ static void subflow_state_change(struct sock *sk) subflow_sched_work_if_closed(mptcp_sk(parent), sk); - if (__mptcp_check_fallback(mptcp_sk(parent)) && - !subflow->rx_eof && subflow_is_done(sk)) { - subflow->rx_eof = 1; - mptcp_subflow_eof(parent); - } + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) @@ -1808,13 +1787,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct mptcp_sock *msk, *next, *head = NULL; struct request_sock *req; + struct sock *sk; /* build a list of all unaccepted mptcp sockets */ spin_lock_bh(&queue->rskq_lock); for (req = queue->rskq_accept_head; req; req = req->dl_next) { struct mptcp_subflow_context *subflow; struct sock *ssk = req->sk; - struct mptcp_sock *msk; if (!sk_is_mptcp(ssk)) continue; @@ -1824,10 +1803,12 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s continue; /* skip if already in list */ - msk = mptcp_sk(subflow->conn); + sk = subflow->conn; + msk = mptcp_sk(sk); if (msk->dl_next || msk == head) continue; + sock_hold(sk); msk->dl_next = head; head = msk; } @@ -1841,34 +1822,30 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s release_sock(listener_ssk); for (msk = head; msk; msk = next) { - struct sock *sk = (struct sock *)msk; - bool do_cancel_work; + sk = (struct sock *)msk; - sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); next = msk->dl_next; - msk->first = NULL; msk->dl_next = NULL; - do_cancel_work = __mptcp_close(sk, 0); + __mptcp_unaccepted_force_close(sk); release_sock(sk); - if (do_cancel_work) { - /* lockdep will report a false positive ABBA deadlock - * between cancel_work_sync and the listener socket. - * The involved locks belong to different sockets WRT - * the existing AB chain. - * Using a per socket key is problematic as key - * deregistration requires process context and must be - * performed at socket disposal time, in atomic - * context. - * Just tell lockdep to consider the listener socket - * released here. - */ - mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); - mptcp_cancel_work(sk); - mutex_acquire(&listener_sk->sk_lock.dep_map, - SINGLE_DEPTH_NESTING, 0, _RET_IP_); - } + + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. + */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); + mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); + sock_put(sk); } @@ -1932,6 +1909,13 @@ static void subflow_ulp_release(struct sock *ssk) * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); + + /* inet_child_forget() does not call sk_state_change(), + * explicitly trigger the socket close machinery + */ + if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, + &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); sock_put(sk); } |