summaryrefslogtreecommitdiff
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c304
1 files changed, 84 insertions, 220 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e894345d10c1..8bf21996734d 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,7 +11,6 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
-#include <linux/igmp.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -20,13 +19,15 @@
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
-#include <net/addrconf.h>
#endif
#include <net/mptcp.h>
#include <net/xfrm.h>
#include "protocol.h"
#include "mib.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/mptcp.h>
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
struct mptcp_sock msk;
@@ -92,16 +93,6 @@ static bool mptcp_is_tcpsk(struct sock *sk)
return false;
}
-static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
-{
- sock_owned_by_me((const struct sock *)msk);
-
- if (likely(!__mptcp_check_fallback(msk)))
- return NULL;
-
- return msk->first;
-}
-
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@@ -411,18 +402,6 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
-static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
-{
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
- if (subflow->request_join && !subflow->fully_established)
- return false;
-
- /* only send if our side has not closed yet */
- return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
-}
-
static bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
@@ -742,18 +721,47 @@ wake:
sk->sk_data_ready(sk);
}
-void __mptcp_flush_join_list(struct mptcp_sock *msk)
+static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
+ bool ret = false;
if (likely(list_empty(&msk->join_list)))
- return;
+ return false;
spin_lock_bh(&msk->join_list_lock);
- list_for_each_entry(subflow, &msk->join_list, node)
+ list_for_each_entry(subflow, &msk->join_list, node) {
+ u32 sseq = READ_ONCE(subflow->setsockopt_seq);
+
mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
+ if (READ_ONCE(msk->setsockopt_seq) != sseq)
+ ret = true;
+ }
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
+
+ return ret;
+}
+
+void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ if (likely(!mptcp_do_flush_join_list(msk)))
+ return;
+
+ if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
+ mptcp_schedule_work((struct sock *)msk);
+}
+
+static void mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags);
+
+ might_sleep();
+
+ if (!mptcp_do_flush_join_list(msk) && !sync_needed)
+ return;
+
+ mptcp_sockopt_sync_all(msk);
}
static bool mptcp_timer_pending(struct sock *sk)
@@ -1277,7 +1285,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
int avail_size;
size_t ret = 0;
- pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
/* compute send limit */
@@ -1405,6 +1413,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
send_info[i].ratio = -1;
}
mptcp_for_each_subflow(msk, subflow) {
+ trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_subflow_active(subflow))
continue;
@@ -1425,10 +1434,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
}
- pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
- msk, nr_active, send_info[0].ssk, send_info[0].ratio,
- send_info[1].ssk, send_info[1].ratio);
-
/* pick the best backup if no other subflow is active */
if (!nr_active)
send_info[0].ssk = send_info[1].ssk;
@@ -1469,7 +1474,7 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
int ret = 0;
prev_ssk = ssk;
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
/* try to keep the subflow socket lock across
@@ -1609,9 +1614,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int ret = 0;
long timeo;
- if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
+ /* we don't support FASTOPEN yet */
+ if (msg->msg_flags & MSG_FASTOPEN)
return -EOPNOTSUPP;
+ /* silently ignore everything else */
+ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
+
mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -1695,7 +1704,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!msk->first_pending)
WRITE_ONCE(msk->first_pending, dfrag);
}
- pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
+ pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);
@@ -1734,36 +1743,41 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
struct msghdr *msg,
- size_t len)
+ size_t len, int flags)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *tmp;
int copied = 0;
- while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
+ skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
u32 offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
u32 count = min_t(size_t, len - copied, data_len);
int err;
- err = skb_copy_datagram_msg(skb, offset, msg, count);
- if (unlikely(err < 0)) {
- if (!copied)
- return err;
- break;
+ if (!(flags & MSG_TRUNC)) {
+ err = skb_copy_datagram_msg(skb, offset, msg, count);
+ if (unlikely(err < 0)) {
+ if (!copied)
+ return err;
+ break;
+ }
}
copied += count;
if (count < data_len) {
- MPTCP_SKB_CB(skb)->offset += count;
+ if (!(flags & MSG_PEEK))
+ MPTCP_SKB_CB(skb)->offset += count;
break;
}
- /* we will bulk release the skb memory later */
- skb->destructor = NULL;
- msk->rmem_released += skb->truesize;
- __skb_unlink(skb, &msk->receive_queue);
- __kfree_skb(skb);
+ if (!(flags & MSG_PEEK)) {
+ /* we will bulk release the skb memory later */
+ skb->destructor = NULL;
+ msk->rmem_released += skb->truesize;
+ __skb_unlink(skb, &msk->receive_queue);
+ __kfree_skb(skb);
+ }
if (copied >= len)
break;
@@ -1895,7 +1909,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool ret, done;
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;
@@ -1940,8 +1954,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int target;
long timeo;
- if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
- return -EOPNOTSUPP;
+ /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
if (unlikely(sk->sk_state == TCP_LISTEN)) {
@@ -1957,7 +1972,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
while (copied < len) {
int bytes_read;
- bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
+ bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags);
if (unlikely(bytes_read < 0)) {
if (!copied)
copied = bytes_read;
@@ -2041,7 +2056,8 @@ out_err:
pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
msk, test_bit(MPTCP_DATA_READY, &msk->flags),
skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
- mptcp_rcv_space_adjust(msk, copied);
+ if (!(flags & MSG_PEEK))
+ mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
return copied;
@@ -2319,7 +2335,7 @@ static void mptcp_worker(struct work_struct *work)
goto unlock;
mptcp_check_data_fin_ack(sk);
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
mptcp_check_fastclose(msk);
@@ -2382,6 +2398,9 @@ static int __mptcp_init_sock(struct sock *sk)
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
+
+ tcp_assign_congestion_control(sk);
+
return 0;
}
@@ -2519,7 +2538,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
}
}
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -2575,6 +2594,8 @@ static void __mptcp_destroy_sock(struct sock *sk)
WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
+
+ tcp_cleanup_congestion_control(sk);
sk_refcnt_debug_release(sk);
mptcp_dispose_initial_subflow(msk);
sock_put(sk);
@@ -2601,7 +2622,7 @@ static void mptcp_close(struct sock *sk, long timeout)
cleanup:
/* orphan all the subflows */
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
- list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast(ssk);
@@ -2656,7 +2677,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- __mptcp_flush_join_list(msk);
+ mptcp_do_flush_join_list(msk);
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -2705,6 +2727,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->snd_nxt = msk->write_seq;
msk->snd_una = msk->write_seq;
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
+ msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
if (mp_opt->mp_capable) {
msk->can_ack = true;
@@ -2813,116 +2836,6 @@ static void mptcp_destroy(struct sock *sk)
sk_sockets_allocated_dec(sk);
}
-static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct sock *sk = (struct sock *)msk;
- struct socket *ssock;
- int ret;
-
- switch (optname) {
- case SO_REUSEPORT:
- case SO_REUSEADDR:
- lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- release_sock(sk);
- return -EINVAL;
- }
-
- ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
- if (ret == 0) {
- if (optname == SO_REUSEPORT)
- sk->sk_reuseport = ssock->sk->sk_reuseport;
- else if (optname == SO_REUSEADDR)
- sk->sk_reuse = ssock->sk->sk_reuse;
- }
- release_sock(sk);
- return ret;
- }
-
- return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
-}
-
-static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct sock *sk = (struct sock *)msk;
- int ret = -EOPNOTSUPP;
- struct socket *ssock;
-
- switch (optname) {
- case IPV6_V6ONLY:
- lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- release_sock(sk);
- return -EINVAL;
- }
-
- ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
- if (ret == 0)
- sk->sk_ipv6only = ssock->sk->sk_ipv6only;
-
- release_sock(sk);
- break;
- }
-
- return ret;
-}
-
-static int mptcp_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct sock *ssk;
-
- pr_debug("msk=%p", msk);
-
- if (level == SOL_SOCKET)
- return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
-
- /* @@ the meaning of setsockopt() when the socket is connected and
- * there are multiple subflows is not yet defined. It is up to the
- * MPTCP-level socket to configure the subflows until the subflow
- * is in TCP fallback, when TCP socket options are passed through
- * to the one remaining subflow.
- */
- lock_sock(sk);
- ssk = __mptcp_tcp_fallback(msk);
- release_sock(sk);
- if (ssk)
- return tcp_setsockopt(ssk, level, optname, optval, optlen);
-
- if (level == SOL_IPV6)
- return mptcp_setsockopt_v6(msk, optname, optval, optlen);
-
- return -EOPNOTSUPP;
-}
-
-static int mptcp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *option)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct sock *ssk;
-
- pr_debug("msk=%p", msk);
-
- /* @@ the meaning of setsockopt() when the socket is connected and
- * there are multiple subflows is not yet defined. It is up to the
- * MPTCP-level socket to configure the subflows until the subflow
- * is in TCP fallback, when socket options are passed through
- * to the one remaining subflow.
- */
- lock_sock(sk);
- ssk = __mptcp_tcp_fallback(msk);
- release_sock(sk);
- if (ssk)
- return tcp_getsockopt(ssk, level, optname, optval, option);
-
- return -EOPNOTSUPP;
-}
-
void __mptcp_data_acked(struct sock *sk)
{
if (!sock_owned_by_user(sk))
@@ -3332,7 +3245,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -3409,34 +3322,10 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
return mask;
}
-static int mptcp_release(struct socket *sock)
-{
- struct mptcp_subflow_context *subflow;
- struct sock *sk = sock->sk;
- struct mptcp_sock *msk;
-
- if (!sk)
- return 0;
-
- lock_sock(sk);
-
- msk = mptcp_sk(sk);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- ip_mc_drop_socket(ssk);
- }
-
- release_sock(sk);
-
- return inet_release(sock);
-}
-
static const struct proto_ops mptcp_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
- .release = mptcp_release,
+ .release = inet_release,
.bind = mptcp_bind,
.connect = mptcp_stream_connect,
.socketpair = sock_no_socketpair,
@@ -3528,35 +3417,10 @@ void __init mptcp_proto_init(void)
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static int mptcp6_release(struct socket *sock)
-{
- struct mptcp_subflow_context *subflow;
- struct mptcp_sock *msk;
- struct sock *sk = sock->sk;
-
- if (!sk)
- return 0;
-
- lock_sock(sk);
-
- msk = mptcp_sk(sk);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- ip_mc_drop_socket(ssk);
- ipv6_sock_mc_close(ssk);
- ipv6_sock_ac_close(ssk);
- }
-
- release_sock(sk);
- return inet6_release(sock);
-}
-
static const struct proto_ops mptcp_v6_stream_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
- .release = mptcp6_release,
+ .release = inet6_release,
.bind = mptcp_bind,
.connect = mptcp_stream_connect,
.socketpair = sock_no_socketpair,