Diffstat (limited to 'net/mptcp/subflow.c')
-rw-r--r--  net/mptcp/subflow.c | 383
1 file changed, 380 insertions(+), 3 deletions(-)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 89b91bc7a831..528351e26371 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -78,6 +78,7 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 1;
subflow_req->remote_key = rx_opt.mptcp.sndr_key;
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
}
}
@@ -116,6 +117,11 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->remote_key);
mptcp_finish_connect(sk);
subflow->conn_finished = 1;
+
+ if (skb) {
+ pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+ }
}
}
@@ -210,6 +216,323 @@ close_child:
static struct inet_connection_sock_af_ops subflow_specific;
+enum mapping_status {
+ MAPPING_OK,
+ MAPPING_INVALID,
+ MAPPING_EMPTY,
+ MAPPING_DATA_FIN
+};
+
+static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
+{
+ if ((u32)seq == (u32)old_seq)
+ return old_seq;
+
+ /* Assume map covers data not mapped yet. */
+ return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
+}
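/*
 * Editor's illustration, not part of the patch: a user-space sketch of
 * the 32-bit to 64-bit DSN expansion performed by expand_seq() above,
 * with hypothetical values. GENMASK_ULL(63, 32) is open-coded since
 * this runs outside the kernel.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t expand_seq_demo(uint64_t old_seq, uint16_t old_data_len,
				uint64_t seq)
{
	if ((uint32_t)seq == (uint32_t)old_seq)
		return old_seq;

	/* assume the new map covers data not mapped yet */
	return seq | ((old_seq + old_data_len + 1) & 0xffffffff00000000ULL);
}

int main(void)
{
	/* previous mapping ends right at a 32-bit wrap of the DSN space */
	uint64_t old_seq = 0x00000001fffffff0ULL;
	uint16_t old_len = 0x10;

	/* the truncated 32-bit seq 0 therefore expands to 0x200000000 */
	printf("expanded=0x%llx\n",
	       (unsigned long long)expand_seq_demo(old_seq, old_len, 0));
	return 0;
}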
+
+static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
+{
+ WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+ ssn, subflow->map_subflow_seq, subflow->map_data_len);
+}
+
+static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned int skb_consumed;
+
+ skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
+ if (WARN_ON_ONCE(skb_consumed >= skb->len))
+ return true;
+
+ return skb->len - skb_consumed <= subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+}
+
+static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
+
+ if (unlikely(before(ssn, subflow->map_subflow_seq))) {
+ /* Mapping covers data later in the subflow stream,
+ * currently unsupported.
+ */
+ warn_bad_map(subflow, ssn);
+ return false;
+ }
+ if (unlikely(!before(ssn, subflow->map_subflow_seq +
+ subflow->map_data_len))) {
+ /* Mapping covers past subflow data, invalid */
+ warn_bad_map(subflow, ssn + skb->len);
+ return false;
+ }
+ return true;
+}
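/*
 * Editor's illustration, not part of the patch: validate_mapping()
 * above accepts an skb only if the relative subflow sequence number of
 * the next byte to read falls in the half-open window
 * [map_subflow_seq, map_subflow_seq + map_data_len). The hypothetical
 * helper below expresses the same test with a single unsigned
 * subtraction, which is wrap-safe like the before() checks used in the
 * kernel code.
 */
#include <stdbool.h>
#include <stdint.h>

static bool ssn_in_map_demo(uint32_t ssn, uint32_t map_subflow_seq,
			    uint16_t map_data_len)
{
	/* e.g. ssn=1500, map_subflow_seq=1000, map_data_len=600 -> true;
	 * ssn=1600 with the same mapping -> false (past the map)
	 */
	return (uint32_t)(ssn - map_subflow_seq) < map_data_len;
}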
+
+static enum mapping_status get_mapping_status(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_ext *mpext;
+ struct sk_buff *skb;
+ u16 data_len;
+ u64 map_seq;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ return MAPPING_EMPTY;
+
+ mpext = mptcp_get_ext(skb);
+ if (!mpext || !mpext->use_map) {
+ if (!subflow->map_valid && !skb->len) {
+ /* the TCP stack delivers 0-len FIN packets to the receive
+ * queue; those are the only 0-len packets ever expected here,
+ * and we can admit no mapping only for 0-len packets
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ WARN_ONCE(1, "0len seq %d:%d flags %x",
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ TCP_SKB_CB(skb)->tcp_flags);
+ sk_eat_skb(ssk, skb);
+ return MAPPING_EMPTY;
+ }
+
+ if (!subflow->map_valid)
+ return MAPPING_INVALID;
+
+ goto validate_seq;
+ }
+
+ pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
+ mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
+ mpext->data_len, mpext->data_fin);
+
+ data_len = mpext->data_len;
+ if (data_len == 0) {
+ pr_err("Infinite mapping not handled");
+ return MAPPING_INVALID;
+ }
+
+ if (mpext->data_fin == 1) {
+ if (data_len == 1) {
+ pr_debug("DATA_FIN with no payload");
+ if (subflow->map_valid) {
+ /* A DATA_FIN might arrive in a DSS
+ * option before the previous mapping
+ * has been fully consumed. Continue
+ * handling the existing mapping.
+ */
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ } else {
+ return MAPPING_DATA_FIN;
+ }
+ }
+
+ /* Adjust for DATA_FIN using 1 byte of sequence space */
+ data_len--;
+ }
+
+ if (!mpext->dsn64) {
+ map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
+ mpext->data_seq);
+ pr_debug("expanded seq=%llu", subflow->map_seq);
+ } else {
+ map_seq = mpext->data_seq;
+ }
+
+ if (subflow->map_valid) {
+ /* Allow replacing only with an identical map */
+ if (subflow->map_seq == map_seq &&
+ subflow->map_subflow_seq == mpext->subflow_seq &&
+ subflow->map_data_len == data_len) {
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ }
+
+ /* If this skb's data is fully covered by the current mapping,
+ * the new map would need caching, which is not supported
+ */
+ if (skb_is_fully_mapped(ssk, skb))
+ return MAPPING_INVALID;
+
+ /* will validate the next map after consuming the current one */
+ return MAPPING_OK;
+ }
+
+ subflow->map_seq = map_seq;
+ subflow->map_subflow_seq = mpext->subflow_seq;
+ subflow->map_data_len = data_len;
+ subflow->map_valid = 1;
+ pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
+ subflow->map_seq, subflow->map_subflow_seq,
+ subflow->map_data_len);
+
+validate_seq:
+ /* we revalidate the current mapping on each new skb, because we must ensure
+ * the current skb is completely covered by the available mapping
+ */
+ if (!validate_mapping(ssk, skb))
+ return MAPPING_INVALID;
+
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+}
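/*
 * Editor's illustration, not part of the patch: get_mapping_status()
 * above strips one byte from data_len when DATA_FIN is set, because
 * DATA_FIN consumes one unit of data sequence space; data_len == 1 with
 * DATA_FIN set means a mapping with no payload at all. Hypothetical
 * helper with made-up field names mirroring that adjustment.
 */
#include <stdint.h>

struct dss_demo {
	uint16_t data_len;	/* length advertised in the DSS option */
	uint8_t  data_fin;	/* DATA_FIN flag from the DSS option */
};

static uint16_t dss_payload_len_demo(const struct dss_demo *d)
{
	/* data_len == 0 would be an infinite mapping, not handled here */
	if (d->data_fin && d->data_len)
		return d->data_len - 1;	/* drop the DATA_FIN byte */
	return d->data_len;
}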
+
+static bool subflow_check_data_avail(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ enum mapping_status status;
+ struct mptcp_sock *msk;
+ struct sk_buff *skb;
+
+ pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
+ subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (subflow->data_avail)
+ return true;
+
+ if (!subflow->conn)
+ return false;
+
+ msk = mptcp_sk(subflow->conn);
+ for (;;) {
+ u32 map_remaining;
+ size_t delta;
+ u64 ack_seq;
+ u64 old_ack;
+
+ status = get_mapping_status(ssk);
+ pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
+ if (status == MAPPING_INVALID) {
+ ssk->sk_err = EBADMSG;
+ goto fatal;
+ }
+
+ if (status != MAPPING_OK)
+ return false;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ return false;
+
+ old_ack = READ_ONCE(msk->ack_seq);
+ ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ ack_seq);
+ if (ack_seq == old_ack)
+ break;
+
+ /* only accept in-sequence mappings. Old values are spurious
+ * retransmissions; we can hit "future" values on an active backup
+ * subflow switch, and we rely on retransmissions to get
+ * in-sequence data.
+ * Concurrent subflow support will require subflow data
+ * reordering
+ */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+ if (before64(ack_seq, old_ack))
+ delta = min_t(size_t, old_ack - ack_seq, map_remaining);
+ else
+ delta = min_t(size_t, ack_seq - old_ack, map_remaining);
+
+ /* discard mapped data */
+ pr_debug("discarding %zu bytes, current map len=%d", delta,
+ map_remaining);
+ if (delta) {
+ struct mptcp_read_arg arg = {
+ .msg = NULL,
+ };
+ read_descriptor_t desc = {
+ .count = delta,
+ .arg.data = &arg,
+ };
+ int ret;
+
+ ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
+ if (ret < 0) {
+ ssk->sk_err = -ret;
+ goto fatal;
+ }
+ if (ret < delta)
+ return false;
+ if (delta == map_remaining)
+ subflow->map_valid = 0;
+ }
+ }
+ return true;
+
+fatal:
+ /* fatal protocol error, close the socket */
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ ssk->sk_error_report(ssk);
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ return false;
+}
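/*
 * Editor's illustration, not part of the patch: the discard amount
 * computed in subflow_check_data_avail() above, with hypothetical
 * numbers. Bytes already covered by the MPTCP-level ack_seq (spurious
 * retransmissions) or belonging to a "future" mapping are dropped from
 * the subflow receive queue, clamped to what is left of the current
 * map. E.g. msk ack_seq=1000, mapping DSN=900, 150 bytes remaining in
 * the map: 100 bytes are discarded so that byte 1000 lines up next.
 */
#include <stddef.h>
#include <stdint.h>

static size_t discard_len_demo(uint64_t msk_ack_seq, uint64_t map_dsn,
			       uint32_t map_remaining)
{
	/* distance between the mapping and the MPTCP-level ack point */
	uint64_t gap = msk_ack_seq > map_dsn ? msk_ack_seq - map_dsn
					     : map_dsn - msk_ack_seq;

	/* never discard more than the current mapping still covers */
	return gap < map_remaining ? (size_t)gap : map_remaining;
}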
+
+bool mptcp_subflow_data_available(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sk_buff *skb;
+
+ /* check if current mapping is still valid */
+ if (subflow->map_valid &&
+ mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
+ subflow->map_valid = 0;
+ subflow->data_avail = 0;
+
+ pr_debug("Done with mapping: seq=%u data_len=%u",
+ subflow->map_subflow_seq,
+ subflow->map_data_len);
+ }
+
+ if (!subflow_check_data_avail(sk)) {
+ subflow->data_avail = 0;
+ return false;
+ }
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ subflow->data_avail = skb &&
+ before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
+ return subflow->data_avail;
+}
+
+static void subflow_data_ready(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ if (!parent || !subflow->mp_capable) {
+ subflow->tcp_data_ready(sk);
+
+ if (parent)
+ parent->sk_data_ready(parent);
+ return;
+ }
+
+ if (mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+}
+
+static void subflow_write_space(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ sk_stream_write_space(sk);
+ if (parent && sk_stream_is_writeable(sk)) {
+ sk_stream_write_space(parent);
+ }
+}
+
static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
@@ -296,6 +619,47 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
return ctx;
}
+static void __subflow_state_change(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static bool subflow_is_done(const struct sock *sk)
+{
+ return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
+}
+
+static void subflow_state_change(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = READ_ONCE(subflow->conn);
+
+ __subflow_state_change(sk);
+
+ /* as recvmsg() does not acquire the subflow socket for ssk selection,
+ * a FIN packet carrying a DSS mapping can go unnoticed if we don't
+ * trigger the data-available machinery here.
+ */
+ if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+
+ if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ !subflow->rx_eof && subflow_is_done(sk)) {
+ subflow->rx_eof = 1;
+ parent->sk_shutdown |= RCV_SHUTDOWN;
+ __subflow_state_change(parent);
+ }
+}
+
static int subflow_ulp_init(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -322,6 +686,12 @@ static int subflow_ulp_init(struct sock *sk)
tp->is_mptcp = 1;
ctx->icsk_af_ops = icsk->icsk_af_ops;
icsk->icsk_af_ops = subflow_default_af_ops(sk);
+ ctx->tcp_data_ready = sk->sk_data_ready;
+ ctx->tcp_state_change = sk->sk_state_change;
+ ctx->tcp_write_space = sk->sk_write_space;
+ sk->sk_data_ready = subflow_data_ready;
+ sk->sk_write_space = subflow_write_space;
+ sk->sk_state_change = subflow_state_change;
out:
return err;
}
@@ -339,10 +709,12 @@ static void subflow_ulp_release(struct sock *sk)
kfree_rcu(ctx, rcu);
}
-static void subflow_ulp_fallback(struct sock *sk)
+static void subflow_ulp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *old_ctx)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ mptcp_subflow_tcp_fallback(sk, old_ctx);
icsk->icsk_ulp_ops = NULL;
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
tcp_sk(sk)->is_mptcp = 0;
@@ -357,23 +729,28 @@ static void subflow_ulp_clone(const struct request_sock *req,
struct mptcp_subflow_context *new_ctx;
if (!subflow_req->mp_capable) {
- subflow_ulp_fallback(newsk);
+ subflow_ulp_fallback(newsk, old_ctx);
return;
}
new_ctx = subflow_create_ctx(newsk, priority);
if (new_ctx == NULL) {
- subflow_ulp_fallback(newsk);
+ subflow_ulp_fallback(newsk, old_ctx);
return;
}
new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+ new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
+ new_ctx->tcp_state_change = old_ctx->tcp_state_change;
+ new_ctx->tcp_write_space = old_ctx->tcp_write_space;
new_ctx->mp_capable = 1;
new_ctx->fourth_ack = 1;
new_ctx->remote_key = subflow_req->remote_key;
new_ctx->local_key = subflow_req->local_key;
new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {