diff options
Diffstat (limited to 'net/mptcp/subflow.c')
| -rw-r--r-- | net/mptcp/subflow.c | 352 |
1 files changed, 219 insertions, 133 deletions
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 39e2cbdf3801..86ce58ae533d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -39,7 +39,7 @@ static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - pr_debug("subflow_req=%p", subflow_req); + pr_debug("subflow_req=%p\n", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); @@ -100,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) return NULL; } subflow_req->local_id = local_id; + subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } @@ -131,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) } } +static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) +{ + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + return -EPERM; +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -145,7 +153,7 @@ static int subflow_check_req(struct request_sock *req, struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; - pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -164,10 +172,17 @@ static int subflow_check_req(struct request_sock *req, if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + if (unlikely(listener->pm_listener)) + return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + + if (mp_opt.backup) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); + } else if (unlikely(listener->pm_listener)) { + return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { @@ -217,7 +232,7 @@ again: } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { - pr_debug("syn inet_sport=%d %d", + pr_debug("syn inet_sport=%d %d\n", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { @@ -232,6 +247,7 @@ again: if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } @@ -239,7 +255,7 @@ again: subflow_init_req_cookie_join_save(subflow_req, skb); } - pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, + pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } @@ -475,6 +491,9 @@ static void subflow_set_remote_key(struct mptcp_sock *msk, mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; + /* for fallback's sake */ + subflow->map_seq = subflow->iasn; + WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); @@ -523,15 +542,18 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); + pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - mptcp_do_fallback(sk); - pr_fallback(msk); + if (!mptcp_try_fallback(sk, + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK)) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_FALLBACKFAILED); + goto do_reset; + } + goto fallback; } @@ -542,6 +564,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_capable = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); + mptcp_active_enable(parent); mptcp_propagate_state(parent, sk, subflow, &mp_opt); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -555,7 +578,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; WRITE_ONCE(subflow->remote_id, mp_opt.join_id); - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -577,13 +600,19 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + if (subflow->backup) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); + if (subflow_use_different_dport(msk, sk)) { - pr_debug("synack inet_dport=%d %d", + pr_debug("synack inet_dport=%d %d\n", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { + /* It looks like MPTCP is blocked, while TCP is not */ + if (subflow->mpc_drop) + mptcp_active_disable(parent); fallback: mptcp_propagate_state(parent, sk, subflow, NULL); } @@ -614,6 +643,8 @@ static int subflow_chk_local_id(struct sock *sk) return err; subflow_set_local_id(subflow, err); + subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); + return 0; } @@ -646,7 +677,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -677,7 +708,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); @@ -721,17 +752,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ -static bool subflow_hmac_valid(const struct request_sock *req, +static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { - const struct mptcp_subflow_request_sock *subflow_req; + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; - struct mptcp_sock *msk; - - subflow_req = mptcp_subflow_rsk(req); - msk = subflow_req->msk; - if (!msk) - return false; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), @@ -776,11 +801,8 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); - subflow->fully_established = 1; + WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); - - if (subflow->is_mptfo) - __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -798,7 +820,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_sock *owner; struct sock *child; - pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed @@ -829,12 +851,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || - !subflow_hmac_valid(req, &mp_opt) || - !mptcp_can_accept_new_subflow(subflow_req->msk)) { - SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK)) fallback = true; - } } create_child: @@ -868,6 +886,10 @@ create_child: ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress @@ -884,12 +906,24 @@ create_child: goto dispose_child; } + if (!subflow_hmac_valid(subflow_req, &mp_opt)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + goto dispose_child; + } + + if (!mptcp_can_accept_new_subflow(owner)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + goto dispose_child; + } + /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { - pr_debug("ack inet_sport=%d %d", + pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { @@ -947,12 +981,13 @@ enum mapping_status { MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, - MAPPING_BAD_CSUM + MAPPING_BAD_CSUM, + MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { - pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } @@ -962,8 +997,10 @@ static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; - if (WARN_ON_ONCE(skb_consumed >= skb->len)) + if (unlikely(skb_consumed >= skb->len)) { + DEBUG_NET_WARN_ON_ONCE(1); return true; + } return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); @@ -1102,8 +1139,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk, return MAPPING_EMPTY; } + /* If the required DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) - return MAPPING_INVALID; + return MAPPING_NODSS; goto validate_seq; } @@ -1112,9 +1150,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len = mpext->data_len; if (data_len == 0) { - pr_debug("infinite mapping received"); + pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); - subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1124,7 +1161,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); - pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); + pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping @@ -1150,7 +1187,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); - pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", + pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ @@ -1196,7 +1233,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); @@ -1221,57 +1258,83 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - u32 incr; + struct tcp_sock *tp = tcp_sk(ssk); + u32 offset, incr, avail_len; + + offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(offset > skb->len)) + goto out; - incr = limit >= skb->len ? skb->len + fin : limit; + avail_len = skb->len - offset; + incr = limit >= avail_len ? avail_len + fin : limit; - pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, - subflow->map_subflow_seq); + pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, + offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; + +out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } -/* sched mptcp worker to remove the subflow if no more data is pending */ +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +/* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { - if (likely(ssk->sk_state != TCP_CLOSE)) + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = (struct sock *)msk; + + if (likely(ssk->sk_state != TCP_CLOSE && + (ssk->sk_state != TCP_CLOSE_WAIT || + inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; - if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - mptcp_schedule_work((struct sock *)msk); -} + if (!skb_queue_empty(&ssk->sk_receive_queue)) + return; -static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) -{ - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work(sk); - if (subflow->mp_join) - return false; - else if (READ_ONCE(msk->csum_enabled)) - return !subflow->valid_csum_seen; - else - return !subflow->fully_established; + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && + msk->first == ssk && + mptcp_update_rcv_data_fin(msk, subflow->map_seq + + subflow->map_data_len, true)) + mptcp_schedule_work(sk); } -static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) +static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; + /* we are really failing, prevent any later subflow join */ + spin_lock_bh(&msk->fallback_lock); + if (!msk->allow_infinite_fallback) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } + msk->allow_subflows = false; + spin_unlock_bh(&msk->fallback_lock); + /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) - return; + return false; /* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set */ if (sock_flag((struct sock *)msk, SOCK_DEAD)) - return; + return true; /* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all; @@ -1283,6 +1346,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) tcp_send_ack(ssk); mptcp_reset_tout_timer(msk, subflow->fail_tout); + return true; } static bool subflow_check_data_avail(struct sock *ssk) @@ -1305,7 +1369,7 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || - status == MAPPING_BAD_CSUM)) + status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback; if (status != MAPPING_OK) @@ -1320,7 +1384,7 @@ static bool subflow_check_data_avail(struct sock *ssk) old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); - pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); @@ -1343,22 +1407,23 @@ fallback: (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; - if (!READ_ONCE(msk->allow_infinite_fallback)) { + if (!mptcp_subflow_fail(msk, ssk)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } - mptcp_subflow_fail(msk, ssk); WRITE_ONCE(subflow->data_avail, true); return true; } - if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSFALLBACK)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMPTCP; + subflow->reset_reason = status == MAPPING_NODSS ? + MPTCP_RST_EMIDDLEBOX : + MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); @@ -1369,15 +1434,16 @@ reset: WRITE_ONCE(subflow->data_avail, false); return false; } - - mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; - subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + subflow->map_seq = __mptcp_expand_seq(subflow->map_seq, + subflow->iasn + + TCP_SKB_CB(skb)->seq - + subflow->ssn_offset - 1); WRITE_ONCE(subflow->data_avail, true); return true; } @@ -1392,7 +1458,7 @@ bool mptcp_subflow_data_available(struct sock *sk) subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); - pr_debug("Done with mapping: seq=%u data_len=%u", + pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, subflow->map_data_len); } @@ -1502,7 +1568,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); - pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) @@ -1544,28 +1610,31 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, #endif } -int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, +int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; + int local_id = local->addr.id; struct sockaddr_storage addr; int remote_id = remote->id; - int local_id = loc->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; - int ifindex; - u8 flags; + /* The userspace PM sent the request too early? */ if (!mptcp_is_fully_established(sk)) goto err_out; - err = mptcp_subflow_create_socket(sk, loc->family, &sf); - if (err) + err = mptcp_subflow_create_socket(sk, local->addr.family, &sf); + if (err) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR); + pr_debug("msk=%p local=%d remote=%d create sock error: %d\n", + msk, local_id, remote_id, err); goto err_out; + } ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); @@ -1573,47 +1642,65 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); - if (local_id) + /* if 'IPADDRANY', the ID will be set later, after the routing */ + if (local->addr.family == AF_INET) { + if (!local->addr.addr.s_addr) + local_id = -1; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + } else if (sk->sk_family == AF_INET6) { + if (ipv6_addr_any(&local->addr.addr6)) + local_id = -1; +#endif + } + + if (local_id >= 0) subflow_set_local_id(subflow, local_id); - mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, - &flags, &ifindex); subflow->remote_key_valid = 1; subflow->remote_key = READ_ONCE(msk->remote_key); subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; - mptcp_info2sockaddr(loc, &addr, ssk->sk_family); + mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - ssk->sk_bound_dev_if = ifindex; - err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); - if (err) + ssk->sk_bound_dev_if = local->ifindex; + err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen); + if (err) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); + pr_debug("msk=%p local=%d remote=%d bind error: %d\n", + msk, local_id, remote_id, err); goto failed; + } mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, + pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; - subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); + subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); - err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); - if (err && err != -EINPROGRESS) + err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK); + if (err && err != -EINPROGRESS) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); + pr_debug("msk=%p local=%d remote=%d connect error: %d\n", + msk, local_id, remote_id, err); goto failed_unlink; + } + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX); /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); - WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_stop_tout_timer(sk); return 0; @@ -1633,30 +1720,39 @@ err_out: return err; } -static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp) +{ + /* Only if the msk has been accepted already (and not orphaned).*/ + if (!mem_cgroup_sockets_enabled || !sk->sk_socket) + return; + + mem_cgroup_sk_inherit(sk, ssk); + __sk_charge(ssk, gfp); +} + +void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk) { #ifdef CONFIG_SOCK_CGROUP_DATA - struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, - *child_skcd = &child->sk_cgrp_data; + struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data, + *ssk_cd = &ssk->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ - if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != - cgroup_id(sock_cgroup_ptr(child_skcd))) { -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = parent->sk_memcg; - - mem_cgroup_sk_free(child); - if (memcg && css_tryget(&memcg->css)) - child->sk_memcg = memcg; -#endif /* CONFIG_MEMCG */ - - cgroup_sk_free(child_skcd); - *child_skcd = *parent_skcd; - cgroup_sk_clone(child_skcd); + if (cgroup_id(sock_cgroup_ptr(sk_cd)) != + cgroup_id(sock_cgroup_ptr(ssk_cd))) { + cgroup_sk_free(ssk_cd); + *ssk_cd = *sk_cd; + cgroup_sk_clone(sk_cd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ + __mptcp_inherit_cgrp_data(parent, child); + if (mem_cgroup_sockets_enabled) + mem_cgroup_sk_inherit(parent, child); +} + static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1708,10 +1804,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ - __netns_tracker_free(net, &sf->sk->ns_tracker, false); - sf->sk->sk_net_refcnt = 1; - get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; @@ -1730,7 +1823,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); *new_sock = sf; sock_hold(sk); @@ -1759,7 +1852,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); - pr_debug("subflow=%p", ctx); + pr_debug("subflow=%p\n", ctx); ctx->tcp_sock = sk; WRITE_ONCE(ctx->local_id, -1); @@ -1778,23 +1871,15 @@ static void __subflow_state_change(struct sock *sk) rcu_read_unlock(); } -static bool subflow_is_done(const struct sock *sk) -{ - return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; -} - static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - struct mptcp_sock *msk; __subflow_state_change(sk); - msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { - mptcp_do_fallback(sk); - pr_fallback(msk); + WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); subflow->conn_finished = 1; mptcp_propagate_state(parent, sk, subflow, NULL); } @@ -1809,13 +1894,6 @@ static void subflow_state_change(struct sock *sk) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); - - /* when the fallback subflow closes the rx side, trigger a 'dummy' - * ingress data fin, so that the msk state will follow along - */ - if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) - mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) @@ -1910,7 +1988,7 @@ static int subflow_ulp_init(struct sock *sk) goto out; } - pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; @@ -1985,7 +2063,6 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; - new_ctx->tcp_sock = newsk; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection @@ -2002,9 +2079,10 @@ static void subflow_ulp_clone(const struct request_sock *req, } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; - new_ctx->fully_established = 1; + WRITE_ONCE(new_ctx->fully_established, true); new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; + new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; @@ -2088,6 +2166,10 @@ void __init mptcp_subflow_init(void) tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcp_prot_override.psock_update_sk_prot = NULL; +#endif #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock @@ -2124,6 +2206,10 @@ void __init mptcp_subflow_init(void) tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcpv6_prot_override.psock_update_sk_prot = NULL; +#endif #endif mptcp_diag_subflow_init(&subflow_ulp_ops); |
