Diffstat (limited to 'net/mptcp')
 net/mptcp/mib.c      |  1
 net/mptcp/mib.h      |  1
 net/mptcp/protocol.c | 83
 net/mptcp/protocol.h |  2
 4 files changed, 56 insertions(+), 31 deletions(-)
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 6003e47c770a..171643815076 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,6 +85,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
 	SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
 	SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
+	SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE),
 };
 
 /* mptcp_mib_alloc - allocate percpu mib counters
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 309bac6fea32..a1d3e9369fbb 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -88,6 +88,7 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_DSSFALLBACK,		/* Bad or missing DSS */
 	MPTCP_MIB_SIMULTCONNFALLBACK,	/* Simultaneous connect */
 	MPTCP_MIB_FALLBACKFAILED,	/* Can't fallback due to msk status */
+	MPTCP_MIB_WINPROBE,		/* MPTCP-level zero window probe */
 
 	__MPTCP_MIB_MAX
 };
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0292162a14ee..2d6b8de35c44 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -194,17 +194,26 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
  * - mptcp does not maintain a msk-level window clamp
  * - returns true when the receive buffer is actually updated
  */
-static bool mptcp_rcvbuf_grow(struct sock *sk)
+static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	const struct net *net = sock_net(sk);
-	int rcvwin, rcvbuf, cap;
+	u32 rcvwin, rcvbuf, cap, oldval;
+	u64 grow;
 
+	oldval = msk->rcvq_space.space;
+	msk->rcvq_space.space = newval;
 	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
 	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 		return false;
 
-	rcvwin = msk->rcvq_space.space << 1;
+	/* DRS is always one RTT late. */
+	rcvwin = newval << 1;
+
+	/* slow start: allow the sender to double its rate. */
+	grow = (u64)rcvwin * (newval - oldval);
+	do_div(grow, oldval);
+	rcvwin += grow << 1;
 
 	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
 		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
@@ -334,7 +343,7 @@ end:
 	skb_set_owner_r(skb, sk);
 	/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
 	if (sk->sk_socket)
-		mptcp_rcvbuf_grow(sk);
+		mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
 }
 
 static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
@@ -998,7 +1007,7 @@ static void __mptcp_clean_una(struct sock *sk)
 			if (WARN_ON_ONCE(!msk->recovery))
 				break;
 
-			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+			msk->first_pending = mptcp_send_next(sk);
 		}
 
 		dfrag_clear(sk, dfrag);
@@ -1290,7 +1299,12 @@ alloc_skb:
 	if (copy == 0) {
 		u64 snd_una = READ_ONCE(msk->snd_una);
 
-		if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
+		/* No need for zero probe if there are any data pending
+		 * either at the msk or ssk level; skb is the current write
+		 * queue tail and can be empty at this point.
+		 */
+		if (snd_una != msk->snd_nxt || skb->len ||
+		    skb != tcp_send_head(ssk)) {
 			tcp_remove_empty_skb(ssk);
 			return 0;
 		}
@@ -1341,6 +1355,7 @@ alloc_skb:
 		 mpext->dsn64);
 
 	if (zero_window_probe) {
+		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE);
 		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
 		mpext->frozen = 1;
 		if (READ_ONCE(msk->csum_enabled))
@@ -1543,7 +1558,7 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
 			mptcp_update_post_push(msk, dfrag, ret);
 		}
 
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+		msk->first_pending = mptcp_send_next(sk);
 
 		if (msk->snd_burst <= 0 ||
 		    !sk_stream_memory_free(ssk) ||
@@ -1903,7 +1918,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			get_page(dfrag->page);
 			list_add_tail(&dfrag->list, &msk->rtx_queue);
 			if (!msk->first_pending)
-				WRITE_ONCE(msk->first_pending, dfrag);
+				msk->first_pending = dfrag;
 		}
 		pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
 			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
@@ -1936,22 +1951,36 @@ do_error:
 
 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
 
-static int __mptcp_recvmsg_mskq(struct sock *sk,
-				struct msghdr *msg,
-				size_t len, int flags,
+static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
+				size_t len, int flags, int copied_total,
 				struct scm_timestamping_internal *tss,
 				int *cmsg_flags)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct sk_buff *skb, *tmp;
+	int total_data_len = 0;
 	int copied = 0;
 
 	skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
-		u32 offset = MPTCP_SKB_CB(skb)->offset;
+		u32 delta, offset = MPTCP_SKB_CB(skb)->offset;
 		u32 data_len = skb->len - offset;
-		u32 count = min_t(size_t, len - copied, data_len);
+		u32 count;
 		int err;
 
+		if (flags & MSG_PEEK) {
+			/* skip already peeked skbs */
+			if (total_data_len + data_len <= copied_total) {
+				total_data_len += data_len;
+				continue;
+			}
+
+			/* skip the already peeked data in the current skb */
+			delta = copied_total - total_data_len;
+			offset += delta;
+			data_len -= delta;
+		}
+
+		count = min_t(size_t, len - copied, data_len);
 		if (!(flags & MSG_TRUNC)) {
 			err = skb_copy_datagram_msg(skb, offset, msg, count);
 			if (unlikely(err < 0)) {
@@ -1968,16 +1997,14 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
 
 		copied += count;
 
-		if (count < data_len) {
-			if (!(flags & MSG_PEEK)) {
+		if (!(flags & MSG_PEEK)) {
+			msk->bytes_consumed += count;
+			if (count < data_len) {
 				MPTCP_SKB_CB(skb)->offset += count;
 				MPTCP_SKB_CB(skb)->map_seq += count;
-				msk->bytes_consumed += count;
+				break;
 			}
-			break;
-		}
-
-		if (!(flags & MSG_PEEK)) {
+
 			/* avoid the indirect call, we know the destructor is sock_rfree */
 			skb->destructor = NULL;
 			skb->sk = NULL;
@@ -1985,7 +2012,6 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
 			sk_mem_uncharge(sk, skb->truesize);
 			__skb_unlink(skb, &sk->sk_receive_queue);
 			skb_attempt_defer_free(skb);
-			msk->bytes_consumed += count;
 		}
 
 		if (copied >= len)
@@ -2049,9 +2075,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
 		goto new_measure;
 
-	msk->rcvq_space.space = msk->rcvq_space.copied;
-	if (mptcp_rcvbuf_grow(sk)) {
-
+	if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
 		/* Make subflows follow along. If we do not do this, we
 		 * get drops at subflow level if skbs can't be moved to
 		 * the mptcp rx queue fast enough (announced rcv_win can
@@ -2063,8 +2087,9 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 
 			ssk = mptcp_subflow_tcp_sock(subflow);
 			slow = lock_sock_fast(ssk);
-			tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
-			tcp_rcvbuf_grow(ssk);
+			/* subflows can be added before tcp_init_transfer() */
+			if (tcp_sk(ssk)->rcvq_space.space)
+				tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied);
 			unlock_sock_fast(ssk, slow);
 		}
 	}
@@ -2183,7 +2208,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	while (copied < len) {
 		int err, bytes_read;
 
-		bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags);
+		bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags,
+						  copied, &tss, &cmsg_flags);
 		if (unlikely(bytes_read < 0)) {
 			if (!copied)
 				copied = bytes_read;
@@ -2874,7 +2900,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
 
-	WRITE_ONCE(msk->first_pending, NULL);
+	msk->first_pending = NULL;
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
 		dfrag_clear(sk, dfrag);
 }
@@ -3414,9 +3440,6 @@ void __mptcp_data_acked(struct sock *sk)
 
 void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 {
-	if (!mptcp_send_head(sk))
-		return;
-
 	if (!sock_owned_by_user(sk))
 		__mptcp_subflow_push_pending(sk, ssk, false);
 	else
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 52f9cfa4ce95..379a88e14e8d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -414,7 +414,7 @@ static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
 {
 	const struct mptcp_sock *msk = mptcp_sk(sk);
 
-	return READ_ONCE(msk->first_pending);
+	return msk->first_pending;
 }
 
 static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
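The receive-window growth added to mptcp_rcvbuf_grow() above follows TCP-style dynamic right-sizing (DRS): the window starts at twice the newly measured per-RTT byte count, then gets an extra slow-start allowance proportional to how much the measure grew. The stand-alone sketch below only illustrates that arithmetic; the values and the plain C division (in place of the kernel's do_div()) are hypothetical user-space stand-ins.

#include <stdint.h>
#include <stdio.h>

/* Illustrative copy of the rcvwin computation in mptcp_rcvbuf_grow();
 * oldval/newval are the previous and current per-RTT byte counts.
 */
static uint32_t drs_rcvwin(uint32_t oldval, uint32_t newval)
{
	/* DRS is always one RTT late: start from twice the new measure */
	uint64_t rcvwin = (uint64_t)newval << 1;
	uint64_t grow;

	/* slow start: allow the sender to double its rate */
	grow = rcvwin * (newval - oldval) / oldval;
	rcvwin += grow << 1;

	return (uint32_t)rcvwin;
}

int main(void)
{
	/* e.g. the per-RTT measure grew from 64 KiB to 96 KiB:
	 * rcvwin = 192 KiB + 2 * (192 KiB * 32 KiB / 64 KiB) = 384 KiB
	 */
	printf("rcvwin = %u\n", drs_rcvwin(64 * 1024, 96 * 1024));
	return 0;
}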
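The copied_total parameter threaded into __mptcp_recvmsg_mskq() lets the caller's loop in mptcp_recvmsg() resume a peek past data that earlier iterations already copied, instead of re-copying the head of the receive queue on every pass. A minimal user-space sketch of the invariant this preserves; the fd and buffer sizes are hypothetical, and a connected, single-threaded MPTCP socket with data already queued is assumed.

#include <assert.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Peeked bytes must match what a subsequent normal read returns,
 * even when the peek spans several queued skbs.
 */
static void peek_then_read(int fd)
{
	char peeked[8192], readout[8192];
	ssize_t p, r;

	p = recv(fd, peeked, sizeof(peeked), MSG_PEEK);
	r = recv(fd, readout, sizeof(readout), 0);

	/* nothing was consumed by the peek, so the real read sees at
	 * least as much data and the common prefix is identical
	 */
	assert(p >= 0 && r >= p);
	assert(memcmp(peeked, readout, (size_t)p) == 0);
}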
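Assuming the usual MPTCP MIB plumbing, the new MPTCP_MIB_WINPROBE counter should surface as MPTcpExtWinProbe under the MPTcpExt line of /proc/net/netstat (for example via nstat -az MPTcpExtWinProbe), incrementing once per MPTCP-level zero-window probe sent from __mptcp_sendmsg_frag().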