Diffstat (limited to 'net/mptcp/subflow.c')
-rw-r--r-- | net/mptcp/subflow.c | 340
1 file changed, 211 insertions, 129 deletions
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 1626dd20c68f..15613d691bfe 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,6 +20,7 @@
 #include <net/transp_v6.h>
 #endif
 #include <net/mptcp.h>
+
 #include "protocol.h"
 #include "mib.h"
@@ -38,7 +39,7 @@ static void subflow_req_destructor(struct request_sock *req)
 {
 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

-	pr_debug("subflow_req=%p", subflow_req);
+	pr_debug("subflow_req=%p\n", subflow_req);

 	if (subflow_req->msk)
 		sock_put((struct sock *)subflow_req->msk);
@@ -99,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
 		return NULL;
 	}
 	subflow_req->local_id = local_id;
+	subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req);

 	return msk;
 }
@@ -130,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
 	}
 }

+static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
+{
+	SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
+	subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+	return -EPERM;
+}
+
 /* Init mptcp request socket.
  *
  * Returns an error code if a JOIN has failed and a TCP reset
@@ -144,14 +153,16 @@ static int subflow_check_req(struct request_sock *req,
 	struct mptcp_options_received mp_opt;
 	bool opt_mp_capable, opt_mp_join;

-	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+	pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener);

 #ifdef CONFIG_TCP_MD5SIG
 	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
 	 * TCP option space.
 	 */
-	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
+		subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
 		return -EINVAL;
+	}
 #endif

 	mptcp_get_options(skb, &mp_opt);
@@ -161,10 +172,17 @@ static int subflow_check_req(struct request_sock *req,
 	if (opt_mp_capable) {
 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

+		if (unlikely(listener->pm_listener))
+			return subflow_reset_req_endp(req, skb);
 		if (opt_mp_join)
 			return 0;
 	} else if (opt_mp_join) {
 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+
+		if (mp_opt.backup)
+			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
+	} else if (unlikely(listener->pm_listener)) {
+		return subflow_reset_req_endp(req, skb);
 	}

 	if (opt_mp_capable && listener->request_mptcp) {
@@ -214,11 +232,12 @@ again:
 		}

 		if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
-			pr_debug("syn inet_sport=%d %d",
+			pr_debug("syn inet_sport=%d %d\n",
 				 ntohs(inet_sk(sk_listener)->inet_sport),
 				 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
 			if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
 				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
+				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
 				return -EPERM;
 			}
 			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
@@ -227,13 +246,16 @@ again:
 		subflow_req_create_thmac(subflow_req);

 		if (unlikely(req->syncookie)) {
-			if (mptcp_can_accept_new_subflow(subflow_req->msk))
-				subflow_init_req_cookie_join_save(subflow_req, skb);
-			else
+			if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
+				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
+				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
 				return -EPERM;
+			}
+
+			subflow_init_req_cookie_join_save(subflow_req, skb);
 		}

-		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
+		pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token,
 			 subflow_req->remote_nonce, subflow_req->msk);
 	}
@@ -281,10 +303,21 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
 }
 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

+static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb)
+{
+	const struct mptcp_ext *mpext = mptcp_get_ext(skb);
+
+	if (!mpext)
+		return SK_RST_REASON_NOT_SPECIFIED;
+
+	return sk_rst_convert_mptcp_reason(mpext->reset_reason);
+}
+
 static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
 					      struct sk_buff *skb,
 					      struct flowi *fl,
-					      struct request_sock *req)
+					      struct request_sock *req,
+					      u32 tw_isn)
 {
 	struct dst_entry *dst;
 	int err;
@@ -292,7 +325,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
 	tcp_rsk(req)->is_mptcp = 1;
 	subflow_init_req(req, sk);

-	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
 	if (!dst)
 		return NULL;
@@ -302,7 +335,8 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
 	dst_release(dst);
 	if (!req->syncookie)
-		tcp_request_sock_ops.send_reset(sk, skb);
+		tcp_request_sock_ops.send_reset(sk, skb,
+						mptcp_get_rst_reason(skb));
 	return NULL;
 }
@@ -351,7 +385,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
 					      struct sk_buff *skb,
 					      struct flowi *fl,
-					      struct request_sock *req)
+					      struct request_sock *req,
+					      u32 tw_isn)
 {
 	struct dst_entry *dst;
 	int err;
@@ -359,7 +394,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
 	tcp_rsk(req)->is_mptcp = 1;
 	subflow_init_req(req, sk);

-	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
 	if (!dst)
 		return NULL;
@@ -369,7 +404,8 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
 	dst_release(dst);
 	if (!req->syncookie)
-		tcp6_request_sock_ops.send_reset(sk, skb);
+		tcp6_request_sock_ops.send_reset(sk, skb,
+						 mptcp_get_rst_reason(skb));
 	return NULL;
 }
 #endif
@@ -405,7 +441,7 @@ void mptcp_subflow_reset(struct sock *ssk)
 	/* must hold: tcp_done() could drop last reference on parent */
 	sock_hold(sk);

-	tcp_send_active_reset(ssk, GFP_ATOMIC);
+	mptcp_send_active_reset_reason(ssk);
 	tcp_done(ssk);
 	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
 		mptcp_schedule_work(sk);
@@ -503,7 +539,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 	subflow->rel_write_seq = 1;
 	subflow->conn_finished = 1;
 	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
+	pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset);

 	mptcp_get_options(skb, &mp_opt);
 	if (subflow->request_mptcp) {
@@ -522,6 +558,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		subflow->mp_capable = 1;
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
 		mptcp_finish_connect(sk);
+		mptcp_active_enable(parent);
 		mptcp_propagate_state(parent, sk, subflow, &mp_opt);
 	} else if (subflow->request_join) {
 		u8 hmac[SHA256_DIGEST_SIZE];
@@ -535,7 +572,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		subflow->thmac = mp_opt.thmac;
 		subflow->remote_nonce = mp_opt.nonce;
 		WRITE_ONCE(subflow->remote_id, mp_opt.join_id);
-		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
+		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n",
 			 subflow, subflow->thmac, subflow->remote_nonce,
 			 subflow->backup);
@@ -557,13 +594,19 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		subflow->mp_join = 1;
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);

+		if (subflow->backup)
+			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX);
+
 		if (subflow_use_different_dport(msk, sk)) {
-			pr_debug("synack inet_dport=%d %d",
+			pr_debug("synack inet_dport=%d %d\n",
 				 ntohs(inet_sk(sk)->inet_dport),
 				 ntohs(inet_sk(parent)->inet_dport));
 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
 		}
 	} else if (mptcp_check_fallback(sk)) {
+		/* It looks like MPTCP is blocked, while TCP is not */
+		if (subflow->mpc_drop)
+			mptcp_active_disable(parent);
 fallback:
 		mptcp_propagate_state(parent, sk, subflow, NULL);
 	}
@@ -594,6 +637,8 @@ static int subflow_chk_local_id(struct sock *sk)
 		return err;

 	subflow_set_local_id(subflow, err);
+	subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk);
+
 	return 0;
 }
@@ -626,7 +671,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

-	pr_debug("subflow=%p", subflow);
+	pr_debug("subflow=%p\n", subflow);

 	/* Never answer to SYNs sent to broadcast or multicast */
 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -657,7 +702,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

-	pr_debug("subflow=%p", subflow);
+	pr_debug("subflow=%p\n", subflow);

 	if (skb->protocol == htons(ETH_P_IP))
 		return subflow_v4_conn_request(sk, skb);
@@ -701,17 +746,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op
 EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);

 /* validate hmac received in third ACK */
-static bool subflow_hmac_valid(const struct request_sock *req,
+static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req,
 			       const struct mptcp_options_received *mp_opt)
 {
-	const struct mptcp_subflow_request_sock *subflow_req;
+	struct mptcp_sock *msk = subflow_req->msk;
 	u8 hmac[SHA256_DIGEST_SIZE];
-	struct mptcp_sock *msk;
-
-	subflow_req = mptcp_subflow_rsk(req);
-	msk = subflow_req->msk;
-	if (!msk)
-		return false;

 	subflow_generate_hmac(READ_ONCE(msk->remote_key),
 			      READ_ONCE(msk->local_key),
@@ -756,11 +795,8 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
 				       const struct mptcp_options_received *mp_opt)
 {
 	subflow_set_remote_key(msk, subflow, mp_opt);
-	subflow->fully_established = 1;
+	WRITE_ONCE(subflow->fully_established, true);
 	WRITE_ONCE(msk->fully_established, true);
-
-	if (subflow->is_mptfo)
-		__mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
 }

 static struct sock *subflow_syn_recv_sock(const struct sock *sk,
@@ -774,10 +810,11 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
 	struct mptcp_subflow_request_sock *subflow_req;
 	struct mptcp_options_received mp_opt;
 	bool fallback, fallback_is_fatal;
+	enum sk_rst_reason reason;
 	struct mptcp_sock *owner;
 	struct sock *child;

-	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+	pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn);

 	/* After child creation we must look for MPC even when options
 	 * are not parsed
@@ -808,12 +845,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
 	} else if (subflow_req->mp_join) {
 		mptcp_get_options(skb, &mp_opt);

-		if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) ||
-		    !subflow_hmac_valid(req, &mp_opt) ||
-		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
-			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+		if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK))
 			fallback = true;
-		}
 	}

 create_child:
@@ -863,23 +896,40 @@ create_child:
 				goto dispose_child;
 			}

+			if (!subflow_hmac_valid(subflow_req, &mp_opt)) {
+				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+				goto dispose_child;
+			}
+
+			if (!mptcp_can_accept_new_subflow(owner)) {
+				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
+				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+				goto dispose_child;
+			}
+
 			/* move the msk reference ownership to the subflow */
 			subflow_req->msk = NULL;
 			ctx->conn = (struct sock *)owner;

 			if (subflow_use_different_sport(owner, sk)) {
-				pr_debug("ack inet_sport=%d %d",
+				pr_debug("ack inet_sport=%d %d\n",
 					 ntohs(inet_sk(sk)->inet_sport),
 					 ntohs(inet_sk((struct sock *)owner)->inet_sport));
 				if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
 					SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
+					subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
 					goto dispose_child;
 				}
 				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
 			}

-			if (!mptcp_finish_join(child))
+			if (!mptcp_finish_join(child)) {
+				struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child);
+
+				subflow_add_reset_reason(skb, subflow->reset_reason);
 				goto dispose_child;
+			}

 			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
 			tcp_rsk(req)->drop_req = true;
@@ -887,7 +937,7 @@ create_child:
 	}

 	/* check for expected invariant - should never trigger, just help
-	 * catching eariler subtle bugs
+	 * catching earlier subtle bugs
 	 */
 	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
 		     (!mptcp_subflow_ctx(child) ||
@@ -899,12 +949,15 @@ dispose_child:
 	tcp_rsk(req)->drop_req = true;
 	inet_csk_prepare_for_destroy_sock(child);
 	tcp_done(child);
-	req->rsk_ops->send_reset(sk, skb);
+	reason = mptcp_get_rst_reason(skb);
+	req->rsk_ops->send_reset(sk, skb, reason);

 	/* The last child reference will be released by the caller */
 	return child;

 fallback:
+	if (fallback)
+		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
 	mptcp_subflow_drop_ctx(child);
 	return child;
 }
@@ -918,12 +971,13 @@ enum mapping_status {
 	MAPPING_EMPTY,
 	MAPPING_DATA_FIN,
 	MAPPING_DUMMY,
-	MAPPING_BAD_CSUM
+	MAPPING_BAD_CSUM,
+	MAPPING_NODSS
 };

 static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
 {
-	pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+	pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n",
 		 ssn, subflow->map_subflow_seq, subflow->map_data_len);
 }
@@ -933,8 +987,10 @@ static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
 	unsigned int skb_consumed;

 	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
-	if (WARN_ON_ONCE(skb_consumed >= skb->len))
+	if (unlikely(skb_consumed >= skb->len)) {
+		DEBUG_NET_WARN_ON_ONCE(1);
 		return true;
+	}

 	return skb->len - skb_consumed <= subflow->map_data_len -
 					  mptcp_subflow_get_map_offset(subflow);
@@ -1073,8 +1129,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 			return MAPPING_EMPTY;
 		}

+		/* If the required DSS has likely been dropped by a middlebox */
 		if (!subflow->map_valid)
-			return MAPPING_INVALID;
+			return MAPPING_NODSS;

 		goto validate_seq;
 	}
@@ -1083,17 +1140,18 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 	data_len = mpext->data_len;
 	if (data_len == 0) {
-		pr_debug("infinite mapping received");
+		pr_debug("infinite mapping received\n");
 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
-		subflow->map_data_len = 0;
 		return MAPPING_INVALID;
 	}

 	if (mpext->data_fin == 1) {
+		u64 data_fin_seq;
+
 		if (data_len == 1) {
 			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
 								 mpext->dsn64);
-			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
+			pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq);
 			if (subflow->map_valid) {
 				/* A DATA_FIN might arrive in a DSS
 				 * option before the previous mapping
@@ -1102,26 +1160,26 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 				 */
 				skb_ext_del(skb, SKB_EXT_MPTCP);
 				return MAPPING_OK;
-			} else {
-				if (updated)
-					mptcp_schedule_work((struct sock *)msk);
-
-				return MAPPING_DATA_FIN;
 			}
-		} else {
-			u64 data_fin_seq = mpext->data_seq + data_len - 1;
-			/* If mpext->data_seq is a 32-bit value, data_fin_seq
-			 * must also be limited to 32 bits.
-			 */
-			if (!mpext->dsn64)
-				data_fin_seq &= GENMASK_ULL(31, 0);
+			if (updated)
+				mptcp_schedule_work((struct sock *)msk);
-			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
-			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
-				 data_fin_seq, mpext->dsn64);
+			return MAPPING_DATA_FIN;
 		}
+		data_fin_seq = mpext->data_seq + data_len - 1;
+
+		/* If mpext->data_seq is a 32-bit value, data_fin_seq must also
+		 * be limited to 32 bits.
+		 */
+		if (!mpext->dsn64)
+			data_fin_seq &= GENMASK_ULL(31, 0);
+
+		mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
+		pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n",
+			 data_fin_seq, mpext->dsn64);
+
 		/* Adjust for DATA_FIN using 1 byte of sequence space */
 		data_len--;
 	}
@@ -1165,7 +1223,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 	if (unlikely(subflow->map_csum_reqd != csum_reqd))
 		return MAPPING_INVALID;

-	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
 		 subflow->map_seq, subflow->map_subflow_seq,
 		 subflow->map_data_len, subflow->map_csum_reqd,
 		 subflow->map_data_csum);
@@ -1190,41 +1248,56 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
-	u32 incr;
+	struct tcp_sock *tp = tcp_sk(ssk);
+	u32 offset, incr, avail_len;
+
+	offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+	if (WARN_ON_ONCE(offset > skb->len))
+		goto out;

-	incr = limit >= skb->len ? skb->len + fin : limit;
+	avail_len = skb->len - offset;
+	incr = limit >= avail_len ? avail_len + fin : limit;

-	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
-		 subflow->map_subflow_seq);
+	pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len,
+		 offset, subflow->map_subflow_seq);
 	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
 	tcp_sk(ssk)->copied_seq += incr;
+
+out:
 	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
 		sk_eat_skb(ssk, skb);
 	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
 		subflow->map_valid = 0;
 }

-/* sched mptcp worker to remove the subflow if no more data is pending */
+static bool subflow_is_done(const struct sock *sk)
+{
+	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
+}
+
+/* sched mptcp worker for subflow cleanup if no more data is pending */
 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
 {
-	if (likely(ssk->sk_state != TCP_CLOSE))
+	struct sock *sk = (struct sock *)msk;
+
+	if (likely(ssk->sk_state != TCP_CLOSE &&
+		   (ssk->sk_state != TCP_CLOSE_WAIT ||
+		    inet_sk_state_load(sk) != TCP_ESTABLISHED)))
 		return;

-	if (skb_queue_empty(&ssk->sk_receive_queue) &&
-	    !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
-		mptcp_schedule_work((struct sock *)msk);
-}
+	if (!skb_queue_empty(&ssk->sk_receive_queue))
+		return;

-static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
-{
-	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+		mptcp_schedule_work(sk);

-	if (subflow->mp_join)
-		return false;
-	else if (READ_ONCE(msk->csum_enabled))
-		return !subflow->valid_csum_seen;
-	else
-		return !subflow->fully_established;
+	/* when the fallback subflow closes the rx side, trigger a 'dummy'
+	 * ingress data fin, so that the msk state will follow along
+	 */
+	if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) &&
+	    msk->first == ssk &&
+	    mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
+		mptcp_schedule_work(sk);
 }
@@ -1232,7 +1305,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	unsigned long fail_tout;

-	/* greceful failure can happen only on the MPC subflow */
+	/* graceful failure can happen only on the MPC subflow */
 	if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
 		return;
@@ -1274,7 +1347,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
 		status = get_mapping_status(ssk, msk);
 		trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
 		if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
-			     status == MAPPING_BAD_CSUM))
+			     status == MAPPING_BAD_CSUM || status == MAPPING_NODSS))
 			goto fallback;

 		if (status != MAPPING_OK)
@@ -1289,7 +1362,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
 		old_ack = READ_ONCE(msk->ack_seq);
 		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
-		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack,
 			 ack_seq);
 		if (unlikely(before64(ack_seq, old_ack))) {
 			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
@@ -1322,19 +1395,21 @@ fallback:
 			return true;
 		}

-		if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
+		if (!READ_ONCE(msk->allow_infinite_fallback)) {
 			/* fatal protocol error, close the socket.
 			 * subflow_error_report() will introduce the appropriate barriers
 			 */
 			subflow->reset_transient = 0;
-			subflow->reset_reason = MPTCP_RST_EMPTCP;
+			subflow->reset_reason = status == MAPPING_NODSS ?
+						MPTCP_RST_EMIDDLEBOX :
+						MPTCP_RST_EMPTCP;
 reset:
 			WRITE_ONCE(ssk->sk_err, EBADMSG);
 			tcp_set_state(ssk, TCP_CLOSE);
 			while ((skb = skb_peek(&ssk->sk_receive_queue)))
 				sk_eat_skb(ssk, skb);
-			tcp_send_active_reset(ssk, GFP_ATOMIC);
+			mptcp_send_active_reset_reason(ssk);
 			WRITE_ONCE(subflow->data_avail, false);
 			return false;
 		}
@@ -1361,7 +1436,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
 		subflow->map_valid = 0;
 		WRITE_ONCE(subflow->data_avail, false);

-		pr_debug("Done with mapping: seq=%u data_len=%u",
+		pr_debug("Done with mapping: seq=%u data_len=%u\n",
 			 subflow->map_subflow_seq,
 			 subflow->map_data_len);
 	}
@@ -1471,7 +1546,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)

 	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

-	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n",
 		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

 	if (likely(icsk->icsk_af_ops == target))
@@ -1513,28 +1588,31 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
 #endif
 }

-int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
 			    const struct mptcp_addr_info *remote)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_subflow_context *subflow;
+	int local_id = local->addr.id;
 	struct sockaddr_storage addr;
 	int remote_id = remote->id;
-	int local_id = loc->id;
 	int err = -ENOTCONN;
 	struct socket *sf;
 	struct sock *ssk;
 	u32 remote_token;
 	int addrlen;
-	int ifindex;
-	u8 flags;

+	/* The userspace PM sent the request too early? */
 	if (!mptcp_is_fully_established(sk))
 		goto err_out;

-	err = mptcp_subflow_create_socket(sk, loc->family, &sf);
-	if (err)
+	err = mptcp_subflow_create_socket(sk, local->addr.family, &sf);
+	if (err) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR);
+		pr_debug("msk=%p local=%d remote=%d create sock error: %d\n",
+			 msk, local_id, remote_id, err);
 		goto err_out;
+	}

 	ssk = sf->sk;
 	subflow = mptcp_subflow_ctx(ssk);
@@ -1542,42 +1620,61 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
 		get_random_bytes(&subflow->local_nonce, sizeof(u32));
 	} while (!subflow->local_nonce);

-	if (local_id)
+	/* if 'IPADDRANY', the ID will be set later, after the routing */
+	if (local->addr.family == AF_INET) {
+		if (!local->addr.addr.s_addr)
+			local_id = -1;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	} else if (sk->sk_family == AF_INET6) {
+		if (ipv6_addr_any(&local->addr.addr6))
+			local_id = -1;
+#endif
+	}
+
+	if (local_id >= 0)
 		subflow_set_local_id(subflow, local_id);

-	mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
-					     &flags, &ifindex);
 	subflow->remote_key_valid = 1;
 	subflow->remote_key = READ_ONCE(msk->remote_key);
 	subflow->local_key = READ_ONCE(msk->local_key);
 	subflow->token = msk->token;
-	mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
+	mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family);

 	addrlen = sizeof(struct sockaddr_in);
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 	if (addr.ss_family == AF_INET6)
 		addrlen = sizeof(struct sockaddr_in6);
 #endif
-	ssk->sk_bound_dev_if = ifindex;
+	ssk->sk_bound_dev_if = local->ifindex;
 	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
-	if (err)
+	if (err) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR);
+		pr_debug("msk=%p local=%d remote=%d bind error: %d\n",
+			 msk, local_id, remote_id, err);
 		goto failed;
+	}

 	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
-	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
+	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk,
 		 remote_token, local_id, remote_id);
 	subflow->remote_token = remote_token;
 	WRITE_ONCE(subflow->remote_id, remote_id);
 	subflow->request_join = 1;
-	subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+	subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
 	subflow->subflow_id = msk->subflow_id++;
 	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

 	sock_hold(ssk);
 	list_add_tail(&subflow->node, &msk->conn_list);
 	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
-	if (err && err != -EINPROGRESS)
+	if (err && err != -EINPROGRESS) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR);
+		pr_debug("msk=%p local=%d remote=%d connect error: %d\n",
+			 msk, local_id, remote_id, err);
 		goto failed_unlink;
+	}
+
+	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX);

 	/* discard the subflow socket */
 	mptcp_sock_graft(ssk, sk->sk_socket);
@@ -1677,10 +1774,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
 	 * needs it.
 	 * Update ns_tracker to current stack trace and refcounted tracker.
 	 */
-	__netns_tracker_free(net, &sf->sk->ns_tracker, false);
-	sf->sk->sk_net_refcnt = 1;
-	get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
-	sock_inuse_add(net, 1);
+	sk_net_refcnt_upgrade(sf->sk);
 	err = tcp_set_ulp(sf->sk, "mptcp");
 	if (err)
 		goto err_free;
@@ -1688,7 +1782,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
 	mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
 	release_sock(sf->sk);

-	/* the newly created socket really belongs to the owning MPTCP master
+	/* the newly created socket really belongs to the owning MPTCP
 	 * socket, even if for additional subflows the allocation is performed
 	 * by a kernel workqueue. Adjust inode references, so that the
 	 * procfs/diag interfaces really show this one belonging to the correct
@@ -1699,7 +1793,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
 	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

 	subflow = mptcp_subflow_ctx(sf->sk);
-	pr_debug("subflow=%p", subflow);
+	pr_debug("subflow=%p\n", subflow);

 	*new_sock = sf;
 	sock_hold(sk);
@@ -1728,7 +1822,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
 	INIT_LIST_HEAD(&ctx->node);
 	INIT_LIST_HEAD(&ctx->delegated_node);

-	pr_debug("subflow=%p", ctx);
+	pr_debug("subflow=%p\n", ctx);

 	ctx->tcp_sock = sk;
 	WRITE_ONCE(ctx->local_id, -1);
@@ -1747,11 +1841,6 @@ static void __subflow_state_change(struct sock *sk)
 	rcu_read_unlock();
 }

-static bool subflow_is_done(const struct sock *sk)
-{
-	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
-}
-
 static void subflow_state_change(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -1778,13 +1867,6 @@ static void subflow_state_change(struct sock *sk)
 		subflow_error_report(sk);

 	subflow_sched_work_if_closed(mptcp_sk(parent), sk);
-
-	/* when the fallback subflow closes the rx side, trigger a 'dummy'
-	 * ingress data fin, so that the msk state will follow along
-	 */
-	if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
-	    mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
-		mptcp_schedule_work(parent);
 }

 void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
@@ -1879,7 +1961,7 @@ static int subflow_ulp_init(struct sock *sk)
 		goto out;
 	}

-	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
+	pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family);

 	tp->is_mptcp = 1;
 	ctx->icsk_af_ops = icsk->icsk_af_ops;
@@ -1954,7 +2036,6 @@ static void subflow_ulp_clone(const struct request_sock *req,
 	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
 	new_ctx->tcp_error_report = old_ctx->tcp_error_report;
 	new_ctx->rel_write_seq = 1;
-	new_ctx->tcp_sock = newsk;

 	if (subflow_req->mp_capable) {
 		/* see comments in subflow_syn_recv_sock(), MPTCP connection
@@ -1971,9 +2052,10 @@ static void subflow_ulp_clone(const struct request_sock *req,
 	} else if (subflow_req->mp_join) {
 		new_ctx->ssn_offset = subflow_req->ssn_offset;
 		new_ctx->mp_join = 1;
-		new_ctx->fully_established = 1;
+		WRITE_ONCE(new_ctx->fully_established, true);
 		new_ctx->remote_key_valid = 1;
 		new_ctx->backup = subflow_req->backup;
+		new_ctx->request_bkup = subflow_req->request_bkup;
 		WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
 		new_ctx->token = subflow_req->token;
 		new_ctx->thmac = subflow_req->thmac;