summaryrefslogtreecommitdiff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/Makefile3
-rw-r--r--net/mptcp/crypto.c17
-rw-r--r--net/mptcp/diag.c104
-rw-r--r--net/mptcp/mib.c69
-rw-r--r--net/mptcp/mib.h40
-rw-r--r--net/mptcp/options.c553
-rw-r--r--net/mptcp/pm.c242
-rw-r--r--net/mptcp/pm_netlink.c857
-rw-r--r--net/mptcp/protocol.c1056
-rw-r--r--net/mptcp/protocol.h204
-rw-r--r--net/mptcp/subflow.c403
-rw-r--r--net/mptcp/token.c38
12 files changed, 3258 insertions, 328 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index 4e98d9edfd0a..baa0640527c7 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_MPTCP) += mptcp.o
-mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o
+mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
+ mib.o pm_netlink.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index 40d1bb18fd60..c151628bd416 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
*idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
}
-void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
- void *hmac)
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
@@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
u8 key2be[8];
int i;
+ if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
+ len = SHA256_DIGEST_SIZE;
+
put_unaligned_be64(key1, key1be);
put_unaligned_be64(key2, key2be);
@@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
for (i = 0; i < 8; i++)
input[i + 8] ^= key2be[i];
- put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]);
- put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);
+ memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
sha256_init(&state);
- sha256_update(&state, input, SHA256_BLOCK_SIZE + 8);
+ sha256_update(&state, input, SHA256_BLOCK_SIZE + len);
/* emit sha256(K1 || msg) on the second input block, so we can
* reuse 'input' for the last hashing
@@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void)
char hmac[20], hmac_hex[41];
u32 nonce1, nonce2;
u64 key1, key2;
+ u8 msg[8];
int i, j;
for (i = 0; i < ARRAY_SIZE(tests); ++i) {
@@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void)
nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
- mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac);
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
for (j = 0; j < 20; ++j)
sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
hmac_hex[40] = 0;
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
new file mode 100644
index 000000000000..a536586742f2
--- /dev/null
+++ b/net/mptcp/diag.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2019 Red Hat
+ *
+ * Author: Davide Caratti <dcaratti@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *sf;
+ struct nlattr *start;
+ u32 flags = 0;
+ int err;
+
+ start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
+ if (!start)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
+ if (!sf) {
+ err = 0;
+ goto nla_failure;
+ }
+
+ if (sf->mp_capable)
+ flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM;
+ if (sf->request_mptcp)
+ flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC;
+ if (sf->mp_join)
+ flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM;
+ if (sf->request_join)
+ flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC;
+ if (sf->backup)
+ flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM;
+ if (sf->request_bkup)
+ flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC;
+ if (sf->fully_established)
+ flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED;
+ if (sf->conn_finished)
+ flags |= MPTCP_SUBFLOW_FLAG_CONNECTED;
+ if (sf->map_valid)
+ flags |= MPTCP_SUBFLOW_FLAG_MAPVALID;
+
+ if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+ sf->rel_write_seq) ||
+ nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq,
+ MPTCP_SUBFLOW_ATTR_PAD) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+ sf->map_subflow_seq) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) ||
+ nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+ sf->map_data_len) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) ||
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) ||
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) {
+ err = -EMSGSIZE;
+ goto nla_failure;
+ }
+
+ rcu_read_unlock();
+ nla_nest_end(skb, start);
+ return 0;
+
+nla_failure:
+ rcu_read_unlock();
+ nla_nest_cancel(skb, start);
+ return err;
+}
+
+static size_t subflow_get_info_size(const struct sock *sk)
+{
+ size_t size = 0;
+
+ size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
+ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
+ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */
+ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */
+ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */
+ 0;
+ return size;
+}
+
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops)
+{
+ ops->get_info = subflow_get_info;
+ ops->get_info_size = subflow_get_info_size;
+}
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
new file mode 100644
index 000000000000..0a6a15f3456d
--- /dev/null
+++ b/net/mptcp/mib.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/mptcp.h>
+#include <net/snmp.h>
+#include <net/net_namespace.h>
+
+#include "mib.h"
+
+static const struct snmp_mib mptcp_snmp_list[] = {
+ SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
+ SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
+ SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
+ SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
+ SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+ SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
+ SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
+ SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_SENTINEL
+};
+
+/* mptcp_mib_alloc - allocate percpu mib counters
+ *
+ * These are allocated when the first mptcp socket is created so
+ * we do not waste percpu memory if mptcp isn't in use.
+ */
+bool mptcp_mib_alloc(struct net *net)
+{
+ struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib);
+
+ if (!mib)
+ return false;
+
+ if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib))
+ free_percpu(mib);
+
+ return true;
+}
+
+void mptcp_seq_show(struct seq_file *seq)
+{
+ struct net *net = seq->private;
+ int i;
+
+ seq_puts(seq, "MPTcpExt:");
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_printf(seq, " %s", mptcp_snmp_list[i].name);
+
+ seq_puts(seq, "\nMPTcpExt:");
+
+ if (!net->mib.mptcp_statistics) {
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_puts(seq, " 0");
+
+ return;
+ }
+
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.mptcp_statistics,
+ mptcp_snmp_list[i].entry));
+ seq_putc(seq, '\n');
+}
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
new file mode 100644
index 000000000000..d7de340fc997
--- /dev/null
+++ b/net/mptcp/mib.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+enum linux_mptcp_mib_field {
+ MPTCP_MIB_NUM = 0,
+ MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
+ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
+ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ __MPTCP_MIB_MAX
+};
+
+#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX
+struct mptcp_mib {
+ unsigned long mibs[LINUX_MIB_MPTCP_MAX];
+};
+
+static inline void MPTCP_INC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+static inline void __MPTCP_INC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ __SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+bool mptcp_mib_alloc(struct net *net);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index fd2c3150e591..bd220ee4aac9 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
mp_opt->rcvr_key, mp_opt->data_len);
break;
+ case MPTCPOPT_MP_JOIN:
+ mp_opt->mp_join = 1;
+ if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->token = get_unaligned_be32(ptr);
+ ptr += 4;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->token, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->thmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->thmac, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
+ ptr += 2;
+ memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
+ pr_debug("MP_JOIN hmac");
+ } else {
+ pr_warn("MP_JOIN bad option size");
+ mp_opt->mp_join = 0;
+ }
+ break;
+
case MPTCPOPT_DSS:
pr_debug("DSS");
ptr++;
@@ -178,6 +210,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
break;
+ case MPTCPOPT_ADD_ADDR:
+ mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
+ if (!mp_opt->echo) {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_4;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_6;
+#endif
+ else
+ break;
+ } else {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_4;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_6;
+#endif
+ else
+ break;
+ }
+
+ mp_opt->add_addr = 1;
+ mp_opt->port = 0;
+ mp_opt->addr_id = *ptr++;
+ pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
+ ptr += 4;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
+ mp_opt->port = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else {
+ memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
+ ptr += 16;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
+ mp_opt->port = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ }
+#endif
+ if (!mp_opt->echo) {
+ mp_opt->ahmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ break;
+
+ case MPTCPOPT_RM_ADDR:
+ if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
+ break;
+
+ mp_opt->rm_addr = 1;
+ mp_opt->rm_id = *ptr++;
+ pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
+ break;
+
default:
break;
}
@@ -231,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
opts->sndr_key = subflow->local_key;
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
+ } else if (subflow->request_join) {
+ pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+ subflow->local_nonce);
+ opts->suboptions = OPTION_MPTCP_MPJ_SYN;
+ opts->join_id = subflow->local_id;
+ opts->token = subflow->remote_token;
+ opts->nonce = subflow->local_nonce;
+ opts->backup = subflow->request_bkup;
+ *size = TCPOLEN_MPTCP_MPJ_SYN;
+ return true;
}
return false;
}
@@ -240,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct tcp_sock *tp = tcp_sk(sk);
- pr_debug("subflow=%p", subflow);
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
- } else {
+ pr_debug("subflow=%p, remote_key=%llu", subflow,
+ subflow->remote_key);
+ } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
+ subflow->mp_join = 1;
+ subflow->thmac = tp->rx_opt.mptcp.thmac;
+ subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
+ subflow->thmac, subflow->remote_nonce);
+ } else if (subflow->request_mptcp) {
tcp_sk(sk)->is_mptcp = 0;
}
}
+/* MP_JOIN client subflow must wait for 4th ack before sending any data:
+ * TCP can't schedule delack timer before the subflow is fully established.
+ * MPTCP uses the delack timer to do 3rd ack retransmissions
+ */
+static void schedule_3rdack_retransmission(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
+
+ /* reschedule with a timeout above RTT, as we must look only for drop */
+ if (tp->srtt_us)
+ timeout = tp->srtt_us << 1;
+ else
+ timeout = TCP_TIMEOUT_INIT;
+
+ WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+static void clear_3rdack_retransmission(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ icsk->icsk_ack.timeout = 0;
+ icsk->icsk_ack.ato = 0;
+ icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
+}
+
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
@@ -259,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
struct mptcp_ext *mpext;
unsigned int data_len;
- pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
- subflow->fourth_ack, subflow->snd_isn,
- skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+ /* When skb is not available, we better over-estimate the emitted
+ * options len. A full DSS option (28 bytes) is longer than
+ * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
+ * tell the caller to defer the estimate to
+ * mptcp_established_options_dss(), which will reserve enough space.
+ */
+ if (!skb)
+ return false;
- if (subflow->mp_capable && !subflow->fourth_ack && skb &&
- subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
- /* When skb is not available, we better over-estimate the
- * emitted options len. A full DSS option is longer than
- * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
- * that.
- */
+ /* MPC/MPJ needed only on 3rd ack packet */
+ if (subflow->fully_established ||
+ subflow->snd_isn != TCP_SKB_CB(skb)->seq)
+ return false;
+
+ if (subflow->mp_capable) {
mpext = mptcp_get_ext(skb);
data_len = mpext ? mpext->data_len : 0;
@@ -297,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
data_len);
return true;
+ } else if (subflow->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_ACK;
+ memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
+ *size = TCPOLEN_MPTCP_MPJ_ACK;
+ pr_debug("subflow=%p", subflow);
+
+ schedule_3rdack_retransmission(sk);
+ return true;
}
return false;
}
@@ -304,21 +462,22 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
struct mptcp_ext *ext)
{
- ext->data_fin = 1;
-
if (!ext->use_map) {
/* RFC6824 requires a DSS mapping with specific values
* if DATA_FIN is set but no data payload is mapped
*/
+ ext->data_fin = 1;
ext->use_map = 1;
ext->dsn64 = 1;
- ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
+ ext->data_seq = subflow->data_fin_tx_seq;
ext->subflow_seq = 0;
ext->data_len = 1;
- } else {
- /* If there's an existing DSS mapping, DATA_FIN consumes
- * 1 additional byte of mapping space.
+ } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) {
+ /* If there's an existing DSS mapping and it is the
+ * final mapping, DATA_FIN consumes 1 additional byte of
+ * mapping space.
*/
+ ext->data_fin = 1;
ext->data_len++;
}
}
@@ -334,8 +493,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
struct mptcp_sock *msk;
unsigned int ack_size;
bool ret = false;
- bool can_ack;
- u64 ack_seq;
u8 tcp_fin;
if (skb) {
@@ -356,8 +513,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
if (mpext)
opts->ext_copy = *mpext;
- if (skb && tcp_fin &&
- subflow->conn->sk_state != TCP_ESTABLISHED)
+ if (skb && tcp_fin && subflow->data_fin_tx_enable)
mptcp_write_data_fin(subflow, &opts->ext_copy);
ret = true;
}
@@ -365,19 +521,9 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
/* passive sockets msk will set the 'can_ack' after accept(), even
* if the first subflow may have the already the remote key handy
*/
- can_ack = true;
opts->ext_copy.use_ack = 0;
msk = mptcp_sk(subflow->conn);
- if (likely(msk && READ_ONCE(msk->can_ack))) {
- ack_seq = msk->ack_seq;
- } else if (subflow->can_ack) {
- mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
- ack_seq++;
- } else {
- can_ack = false;
- }
-
- if (unlikely(!can_ack)) {
+ if (!READ_ONCE(msk->can_ack)) {
*size = ALIGN(dss_size, 4);
return ret;
}
@@ -390,7 +536,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
dss_size += ack_size;
- opts->ext_copy.data_ack = ack_seq;
+ opts->ext_copy.data_ack = msk->ack_seq;
opts->ext_copy.ack64 = 1;
opts->ext_copy.use_ack = 1;
@@ -398,6 +544,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return true;
}
+static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
+ struct in_addr *addr)
+{
+ u8 hmac[MPTCP_ADDR_HMAC_LEN];
+ u8 msg[7];
+
+ msg[0] = addr_id;
+ memcpy(&msg[1], &addr->s_addr, 4);
+ msg[5] = 0;
+ msg[6] = 0;
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
+
+ return get_unaligned_be64(hmac);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
+ struct in6_addr *addr)
+{
+ u8 hmac[MPTCP_ADDR_HMAC_LEN];
+ u8 msg[19];
+
+ msg[0] = addr_id;
+ memcpy(&msg[1], &addr->s6_addr, 16);
+ msg[17] = 0;
+ msg[18] = 0;
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
+
+ return get_unaligned_be64(hmac);
+}
+#endif
+
+static bool mptcp_established_options_addr(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_addr_info saddr;
+ int len;
+
+ if (!mptcp_pm_should_signal(msk) ||
+ !(mptcp_pm_addr_signal(msk, remaining, &saddr)))
+ return false;
+
+ len = mptcp_add_addr_len(saddr.family);
+ if (remaining < len)
+ return false;
+
+ *size = len;
+ opts->addr_id = saddr.id;
+ if (saddr.family == AF_INET) {
+ opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
+ opts->addr = saddr.addr;
+ opts->ahmac = add_addr_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr);
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (saddr.family == AF_INET6) {
+ opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
+ opts->addr6 = saddr.addr6;
+ opts->ahmac = add_addr6_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr6);
+ }
+#endif
+ pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
+
+ return true;
+}
+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
@@ -405,6 +628,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int opt_size = 0;
bool ret = false;
+ opts->suboptions = 0;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -419,6 +644,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
*size += opt_size;
remaining -= opt_size;
+ if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
return ret;
}
@@ -435,54 +665,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
pr_debug("subflow_req=%p, local_key=%llu",
subflow_req, subflow_req->local_key);
return true;
+ } else if (subflow_req->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
+ opts->backup = subflow_req->backup;
+ opts->join_id = subflow_req->local_id;
+ opts->thmac = subflow_req->thmac;
+ opts->nonce = subflow_req->local_nonce;
+ pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ subflow_req, opts->backup, opts->join_id,
+ opts->thmac, opts->nonce);
+ *size = TCPOLEN_MPTCP_MPJ_SYNACK;
+ return true;
}
return false;
}
-static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
- struct sk_buff *skb,
- struct mptcp_options_received *mp_opt)
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
+ struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt)
{
/* here we can process OoO, in-window pkts, only in-sequence 4th ack
- * are relevant
+ * will make the subflow fully established
*/
- if (likely(subflow->fourth_ack ||
- TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
- return true;
+ if (likely(subflow->fully_established)) {
+ /* on passive sockets, check for 3rd ack retransmission
+ * note that msk is always set by subflow_syn_recv_sock()
+ * for mp_join subflows
+ */
+ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
+ subflow->mp_join && mp_opt->mp_join &&
+ READ_ONCE(msk->pm.server_side))
+ tcp_send_ack(sk);
+ goto fully_established;
+ }
- if (mp_opt->use_ack)
- subflow->fourth_ack = 1;
+ /* we should process OoO packets before the first subflow is fully
+ * established, but not expected for MP_JOIN subflows
+ */
+ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
+ return subflow->mp_capable;
- if (subflow->can_ack)
- return true;
+ if (mp_opt->use_ack) {
+ /* subflows are fully established as soon as we get any
+ * additional ack.
+ */
+ subflow->fully_established = 1;
+ goto fully_established;
+ }
+
+ WARN_ON_ONCE(subflow->can_ack);
/* If the first established packet does not contain MP_CAPABLE + data
* then fallback to TCP
*/
if (!mp_opt->mp_capable) {
subflow->mp_capable = 0;
- tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
+ tcp_sk(sk)->is_mptcp = 0;
return false;
}
+
+ subflow->fully_established = 1;
subflow->remote_key = mp_opt->sndr_key;
subflow->can_ack = 1;
+
+fully_established:
+ if (likely(subflow->pm_notified))
+ return true;
+
+ subflow->pm_notified = 1;
+ if (subflow->mp_join) {
+ clear_3rdack_retransmission(sk);
+ mptcp_pm_subflow_established(msk, subflow);
+ } else {
+ mptcp_pm_fully_established(msk);
+ }
return true;
}
+static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
+{
+ u32 old_ack32, cur_ack32;
+
+ if (use_64bit)
+ return cur_ack;
+
+ old_ack32 = (u32)old_ack;
+ cur_ack32 = (u32)cur_ack;
+ cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
+ if (unlikely(before(cur_ack32, old_ack32)))
+ return cur_ack + (1LL << 32);
+ return cur_ack;
+}
+
+static void update_una(struct mptcp_sock *msk,
+ struct mptcp_options_received *mp_opt)
+{
+ u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
+ u64 write_seq = READ_ONCE(msk->write_seq);
+
+ /* avoid ack expansion on update conflict, to reduce the risk of
+ * wrongly expanding to a future ack sequence number, which is way
+ * more dangerous than missing an ack
+ */
+ new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
+
+ /* ACK for data not even sent yet? Ignore. */
+ if (after64(new_snd_una, write_seq))
+ new_snd_una = old_snd_una;
+
+ while (after64(new_snd_una, old_snd_una)) {
+ snd_una = old_snd_una;
+ old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
+ new_snd_una);
+ if (old_snd_una == snd_una) {
+ mptcp_data_acked((struct sock *)msk);
+ break;
+ }
+ }
+}
+
+static bool add_addr_hmac_valid(struct mptcp_sock *msk,
+ struct mptcp_options_received *mp_opt)
+{
+ u64 hmac = 0;
+
+ if (mp_opt->echo)
+ return true;
+
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
+ hmac = add_addr_generate_hmac(msk->remote_key,
+ msk->local_key,
+ mp_opt->addr_id, &mp_opt->addr);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else
+ hmac = add_addr6_generate_hmac(msk->remote_key,
+ msk->local_key,
+ mp_opt->addr_id, &mp_opt->addr6);
+#endif
+
+ pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
+ msk, (unsigned long long)hmac,
+ (unsigned long long)mp_opt->ahmac);
+
+ return hmac == mp_opt->ahmac;
+}
+
void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct tcp_options_received *opt_rx)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_options_received *mp_opt;
struct mptcp_ext *mpext;
mp_opt = &opt_rx->mptcp;
- if (!check_fourth_ack(subflow, skb, mp_opt))
+ if (!check_fully_established(msk, sk, subflow, skb, mp_opt))
return;
+ if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) {
+ struct mptcp_addr_info addr;
+
+ addr.port = htons(mp_opt->port);
+ addr.id = mp_opt->addr_id;
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ addr.family = AF_INET;
+ addr.addr = mp_opt->addr;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) {
+ addr.family = AF_INET6;
+ addr.addr6 = mp_opt->addr6;
+ }
+#endif
+ if (!mp_opt->echo)
+ mptcp_pm_add_addr_received(msk, &addr);
+ mp_opt->add_addr = 0;
+ }
+
if (!mp_opt->dss)
return;
+ /* we can't wait for recvmsg() to update the ack_seq, otherwise
+ * monodirectional flows will stuck
+ */
+ if (mp_opt->use_ack)
+ update_una(msk, mp_opt);
+
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
return;
@@ -509,12 +879,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
mpext->use_map = 1;
}
- if (mp_opt->use_ack) {
- mpext->data_ack = mp_opt->data_ack;
- mpext->use_ack = 1;
- mpext->ack64 = mp_opt->ack64;
- }
-
mpext->data_fin = mp_opt->data_fin;
}
@@ -533,10 +897,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
else
len = TCPOLEN_MPTCP_MPC_ACK;
- *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
- (MPTCPOPT_MP_CAPABLE << 12) |
- (MPTCP_SUPPORTED_VERSION << 8) |
- MPTCP_CAP_HMAC_SHA256);
+ *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
+ MPTCP_SUPPORTED_VERSION,
+ MPTCP_CAP_HMAC_SHA256);
if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
opts->suboptions))
@@ -558,6 +921,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
}
mp_capable_done:
+ if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
+ if (opts->ahmac)
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR, 0,
+ opts->addr_id);
+ else
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR_BASE,
+ MPTCP_ADDR_ECHO,
+ opts->addr_id);
+ memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
+ ptr += 1;
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
+ if (opts->ahmac)
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR6, 0,
+ opts->addr_id);
+ else
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR6_BASE,
+ MPTCP_ADDR_ECHO,
+ opts->addr_id);
+ memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
+ ptr += 4;
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ }
+#endif
+
+ if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
+ TCPOLEN_MPTCP_RM_ADDR_BASE,
+ 0, opts->rm_id);
+ }
+
+ if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYN,
+ opts->backup, opts->join_id);
+ put_unaligned_be32(opts->token, ptr);
+ ptr += 1;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ }
+
+ if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYNACK,
+ opts->backup, opts->join_id);
+ put_unaligned_be64(opts->thmac, ptr);
+ ptr += 2;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ }
+
+ if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+ memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+ ptr += 5;
+ }
+
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
@@ -579,10 +1013,7 @@ mp_capable_done:
flags |= MPTCP_DSS_DATA_FIN;
}
- *ptr++ = htonl((TCPOPT_MPTCP << 24) |
- (len << 16) |
- (MPTCPOPT_DSS << 12) |
- (flags));
+ *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
put_unaligned_be64(mpext->data_ack, ptr);
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
new file mode 100644
index 000000000000..064639f72487
--- /dev/null
+++ b/net/mptcp/pm.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+static struct workqueue_struct *pm_wq;
+
+/* path manager command handlers */
+
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ pr_debug("msk=%p, local_id=%d", msk, addr->id);
+
+ msk->pm.local = *addr;
+ WRITE_ONCE(msk->pm.addr_signal, true);
+ return 0;
+}
+
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
+{
+ return -ENOTSUPP;
+}
+
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
+{
+ return -ENOTSUPP;
+}
+
+/* path manager event handlers */
+
+void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
+
+ WRITE_ONCE(pm->server_side, server_side);
+}
+
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ int ret;
+
+ pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
+ pm->subflows_max, READ_ONCE(pm->accept_subflow));
+
+ /* try to avoid acquiring the lock below */
+ if (!READ_ONCE(pm->accept_subflow))
+ return false;
+
+ spin_lock_bh(&pm->lock);
+ ret = pm->subflows < pm->subflows_max;
+ if (ret && ++pm->subflows == pm->subflows_max)
+ WRITE_ONCE(pm->accept_subflow, false);
+ spin_unlock_bh(&pm->lock);
+
+ return ret;
+}
+
+/* return true if the new status bit is currently cleared, that is, this event
+ * can be server, eventually by an already scheduled work
+ */
+static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
+ enum mptcp_pm_status new_status)
+{
+ pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+ BIT(new_status));
+ if (msk->pm.status & BIT(new_status))
+ return false;
+
+ msk->pm.status |= BIT(new_status);
+ if (queue_work(pm_wq, &msk->pm.work))
+ sock_hold((struct sock *)msk);
+ return true;
+}
+
+void mptcp_pm_fully_established(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p", msk);
+
+ /* try to avoid acquiring the lock below */
+ if (!READ_ONCE(pm->work_pending))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ if (READ_ONCE(pm->work_pending))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_connection_closed(struct mptcp_sock *msk)
+{
+ pr_debug("msk=%p", msk);
+}
+
+void mptcp_pm_subflow_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p", msk);
+
+ if (!READ_ONCE(pm->work_pending))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ if (READ_ONCE(pm->work_pending))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
+{
+ pr_debug("msk=%p", msk);
+}
+
+void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+ READ_ONCE(pm->accept_addr));
+
+ /* avoid acquiring the lock if there is no room for fouther addresses */
+ if (!READ_ONCE(pm->accept_addr))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ /* be sure there is something to signal re-checking under PM lock */
+ if (READ_ONCE(pm->accept_addr) &&
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
+ pm->remote = *addr;
+
+ spin_unlock_bh(&pm->lock);
+}
+
+/* path manager helpers */
+
+bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr)
+{
+ int ret = false;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ /* double check after the lock is acquired */
+ if (!mptcp_pm_should_signal(msk))
+ goto out_unlock;
+
+ if (remaining < mptcp_add_addr_len(msk->pm.local.family))
+ goto out_unlock;
+
+ *saddr = msk->pm.local;
+ WRITE_ONCE(msk->pm.addr_signal, false);
+ ret = true;
+
+out_unlock:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ return mptcp_pm_nl_get_local_id(msk, skc);
+}
+
+static void pm_worker(struct work_struct *work)
+{
+ struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data,
+ work);
+ struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm);
+ struct sock *sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+
+ pr_debug("msk=%p status=%x", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+ sock_put(sk);
+}
+
+void mptcp_pm_data_init(struct mptcp_sock *msk)
+{
+ msk->pm.add_addr_signaled = 0;
+ msk->pm.add_addr_accepted = 0;
+ msk->pm.local_addr_used = 0;
+ msk->pm.subflows = 0;
+ WRITE_ONCE(msk->pm.work_pending, false);
+ WRITE_ONCE(msk->pm.addr_signal, false);
+ WRITE_ONCE(msk->pm.accept_addr, false);
+ WRITE_ONCE(msk->pm.accept_subflow, false);
+ msk->pm.status = 0;
+
+ spin_lock_init(&msk->pm.lock);
+ INIT_WORK(&msk->pm.work, pm_worker);
+
+ mptcp_pm_nl_data_init(msk);
+}
+
+void mptcp_pm_close(struct mptcp_sock *msk)
+{
+ if (cancel_work_sync(&msk->pm.work))
+ sock_put((struct sock *)msk);
+}
+
+void mptcp_pm_init(void)
+{
+ pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
+ if (!pm_wq)
+ panic("Failed to allocate workqueue");
+
+ mptcp_pm_nl_init();
+}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
new file mode 100644
index 000000000000..a0ce7f324499
--- /dev/null
+++ b/net/mptcp/pm_netlink.c
@@ -0,0 +1,857 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2020, Red Hat, Inc.
+ */
+
+#include <linux/inet.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+#include <net/mptcp.h>
+#include <net/genetlink.h>
+#include <uapi/linux/mptcp.h>
+
+#include "protocol.h"
+
+/* forward declaration */
+static struct genl_family mptcp_genl_family;
+
+static int pm_nl_pernet_id;
+
+struct mptcp_pm_addr_entry {
+ struct list_head list;
+ unsigned int flags;
+ int ifindex;
+ struct mptcp_addr_info addr;
+ struct rcu_head rcu;
+};
+
+struct pm_nl_pernet {
+ /* protects pernet updates */
+ spinlock_t lock;
+ struct list_head local_addr_list;
+ unsigned int addrs;
+ unsigned int add_addr_signal_max;
+ unsigned int add_addr_accept_max;
+ unsigned int local_addr_max;
+ unsigned int subflows_max;
+ unsigned int next_id;
+};
+
+#define MPTCP_PM_ADDR_MAX 8
+
+static bool addresses_equal(const struct mptcp_addr_info *a,
+ struct mptcp_addr_info *b, bool use_port)
+{
+ bool addr_equals = false;
+
+ if (a->family != b->family)
+ return false;
+
+ if (a->family == AF_INET)
+ addr_equals = a->addr.s_addr == b->addr.s_addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else
+ addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+#endif
+
+ if (!addr_equals)
+ return false;
+ if (!use_port)
+ return true;
+
+ return a->port == b->port;
+}
+
+static void local_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->port = 0;
+ addr->family = skc->skc_family;
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_rcv_saddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_rcv_saddr;
+#endif
+}
+
+static void remote_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->family = skc->skc_family;
+ addr->port = skc->skc_dport;
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_daddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_daddr;
+#endif
+}
+
+static bool lookup_subflow_by_saddr(const struct list_head *list,
+ struct mptcp_addr_info *saddr)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info cur;
+ struct sock_common *skc;
+
+ list_for_each_entry(subflow, list, node) {
+ skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+
+ local_address(skc, &cur);
+ if (addresses_equal(&cur, saddr, false))
+ return true;
+ }
+
+ return false;
+}
+
+static struct mptcp_pm_addr_entry *
+select_local_address(const struct pm_nl_pernet *pernet,
+ struct mptcp_sock *msk)
+{
+ struct mptcp_pm_addr_entry *entry, *ret = NULL;
+
+ rcu_read_lock();
+ spin_lock_bh(&msk->join_list_lock);
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ continue;
+
+ /* avoid any address already in use by subflows and
+ * pending join
+ */
+ if (entry->addr.family == ((struct sock *)msk)->sk_family &&
+ !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
+ !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) {
+ ret = entry;
+ break;
+ }
+ }
+ spin_unlock_bh(&msk->join_list_lock);
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct mptcp_pm_addr_entry *
+select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
+{
+ struct mptcp_pm_addr_entry *entry, *ret = NULL;
+ int i = 0;
+
+ rcu_read_lock();
+ /* do not keep any additional per socket state, just signal
+ * the address list in order.
+ * Note: removal from the local address list during the msk life-cycle
+ * can lead to additional addresses not being announced.
+ */
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ continue;
+ if (i++ == pos) {
+ ret = entry;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void check_work_pending(struct mptcp_sock *msk)
+{
+ if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max &&
+ (msk->pm.local_addr_used == msk->pm.local_addr_max ||
+ msk->pm.subflows == msk->pm.subflows_max))
+ WRITE_ONCE(msk->pm.work_pending, false);
+}
+
+static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *local;
+ struct mptcp_addr_info remote;
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
+ msk->pm.local_addr_used, msk->pm.local_addr_max,
+ msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max,
+ msk->pm.subflows, msk->pm.subflows_max);
+
+ /* check first for announce */
+ if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) {
+ local = select_signal_address(pernet,
+ msk->pm.add_addr_signaled);
+
+ if (local) {
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &local->addr);
+ } else {
+ /* pick failed, avoid fourther attempts later */
+ msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
+ }
+
+ check_work_pending(msk);
+ }
+
+ /* check if should create a new subflow */
+ if (msk->pm.local_addr_used < msk->pm.local_addr_max &&
+ msk->pm.subflows < msk->pm.subflows_max) {
+ remote_address((struct sock_common *)sk, &remote);
+
+ local = select_local_address(pernet, msk);
+ if (local) {
+ msk->pm.local_addr_used++;
+ msk->pm.subflows++;
+ check_work_pending(msk);
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_subflow_connect(sk, local->ifindex,
+ &local->addr, &remote);
+ spin_lock_bh(&msk->pm.lock);
+ return;
+ }
+
+ /* lookup failed, avoid fourther attempts later */
+ msk->pm.local_addr_used = msk->pm.local_addr_max;
+ check_work_pending(msk);
+ }
+}
+
+void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info remote;
+ struct mptcp_addr_info local;
+
+ pr_debug("accepted %d:%d remote family %d",
+ msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max,
+ msk->pm.remote.family);
+ msk->pm.add_addr_accepted++;
+ msk->pm.subflows++;
+ if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max ||
+ msk->pm.subflows >= msk->pm.subflows_max)
+ WRITE_ONCE(msk->pm.accept_addr, false);
+
+ /* connect to the specified remote address, using whatever
+ * local address the routing configuration will pick.
+ */
+ remote = msk->pm.remote;
+ if (!remote.port)
+ remote.port = sk->sk_dport;
+ memset(&local, 0, sizeof(local));
+ local.family = remote.family;
+
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
+ spin_lock_bh(&msk->pm.lock);
+}
+
+static bool address_use_port(struct mptcp_pm_addr_entry *entry)
+{
+ return (entry->flags &
+ (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
+ MPTCP_PM_ADDR_FLAG_SIGNAL;
+}
+
+static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_pm_addr_entry *cur;
+ int ret = -EINVAL;
+
+ spin_lock_bh(&pernet->lock);
+ /* to keep the code simple, don't do IDR-like allocation for address ID,
+ * just bail when we exceed limits
+ */
+ if (pernet->next_id > 255)
+ goto out;
+ if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
+ goto out;
+
+ /* do not insert duplicate address, differentiate on port only
+ * singled addresses
+ */
+ list_for_each_entry(cur, &pernet->local_addr_list, list) {
+ if (addresses_equal(&cur->addr, &entry->addr,
+ address_use_port(entry) &&
+ address_use_port(cur)))
+ goto out;
+ }
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ pernet->add_addr_signal_max++;
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ pernet->local_addr_max++;
+
+ entry->addr.id = pernet->next_id++;
+ pernet->addrs++;
+ list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ ret = entry->addr.id;
+
+out:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_addr_info skc_local;
+ struct mptcp_addr_info msk_local;
+ struct pm_nl_pernet *pernet;
+ int ret = -1;
+
+ if (WARN_ON_ONCE(!msk))
+ return -1;
+
+ /* The 0 ID mapping is defined by the first subflow, copied into the msk
+ * addr
+ */
+ local_address((struct sock_common *)msk, &msk_local);
+ local_address((struct sock_common *)msk, &skc_local);
+ if (addresses_equal(&msk_local, &skc_local, false))
+ return 0;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (addresses_equal(&entry->addr, &skc_local, false)) {
+ ret = entry->addr.id;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret >= 0)
+ return ret;
+
+ /* address not found, add to local list */
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->flags = 0;
+ entry->addr = skc_local;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0)
+ kfree(entry);
+
+ return ret;
+}
+
+void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ struct pm_nl_pernet *pernet;
+ bool subflows;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max);
+ pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max);
+ pm->local_addr_max = READ_ONCE(pernet->local_addr_max);
+ pm->subflows_max = READ_ONCE(pernet->subflows_max);
+ subflows = !!pm->subflows_max;
+ WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) ||
+ !!pm->add_addr_signal_max);
+ WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows);
+ WRITE_ONCE(pm->accept_subflow, subflows);
+}
+
+#define MPTCP_PM_CMD_GRP_OFFSET 0
+
+static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
+ [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
+};
+
+static const struct nla_policy
+mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
+ [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
+ [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr), },
+ [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
+ [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
+ [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
+};
+
+static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
+ [MPTCP_PM_ATTR_ADDR] =
+ NLA_POLICY_NESTED(mptcp_pm_addr_policy),
+ [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+};
+
+static int mptcp_pm_family_to_addr(int family)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (family == AF_INET6)
+ return MPTCP_PM_ADDR_ATTR_ADDR6;
+#endif
+ return MPTCP_PM_ADDR_ATTR_ADDR4;
+}
+
+static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+ int err, addr_addr;
+
+ if (!attr) {
+ GENL_SET_ERR_MSG(info, "missing address info");
+ return -EINVAL;
+ }
+
+ /* no validation needed - was already done via nested policy */
+ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
+ mptcp_pm_addr_policy, info->extack);
+ if (err)
+ return err;
+
+ memset(entry, 0, sizeof(*entry));
+ if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
+ if (!require_family)
+ goto skip_family;
+
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing family");
+ return -EINVAL;
+ }
+
+ entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
+ if (entry->addr.family != AF_INET
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ && entry->addr.family != AF_INET6
+#endif
+ ) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "unknown address family");
+ return -EINVAL;
+ }
+ addr_addr = mptcp_pm_family_to_addr(entry->addr.family);
+ if (!tb[addr_addr]) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing address data");
+ return -EINVAL;
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (entry->addr.family == AF_INET6)
+ entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]);
+ else
+#endif
+ entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
+
+skip_family:
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
+ entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_ID])
+ entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
+ entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+
+ return 0;
+}
+
+static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
+{
+ return net_generic(genl_info_net(info), pm_nl_pernet_id);
+}
+
+static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "can't allocate addr");
+ return -ENOMEM;
+ }
+
+ *entry = addr;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
+ kfree(entry);
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (entry->addr.id == id)
+ return entry;
+ }
+ return NULL;
+}
+
+static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ pernet->add_addr_signal_max--;
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ pernet->local_addr_max--;
+
+ pernet->addrs--;
+ list_del_rcu(&entry->list);
+ kfree_rcu(entry, rcu);
+out:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+static void __flush_addrs(struct pm_nl_pernet *pernet)
+{
+ while (!list_empty(&pernet->local_addr_list)) {
+ struct mptcp_pm_addr_entry *cur;
+
+ cur = list_entry(pernet->local_addr_list.next,
+ struct mptcp_pm_addr_entry, list);
+ list_del_rcu(&cur->list);
+ kfree_rcu(cur, rcu);
+ }
+}
+
+static void __reset_counters(struct pm_nl_pernet *pernet)
+{
+ pernet->add_addr_signal_max = 0;
+ pernet->add_addr_accept_max = 0;
+ pernet->local_addr_max = 0;
+ pernet->addrs = 0;
+}
+
+static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+
+ spin_lock_bh(&pernet->lock);
+ __flush_addrs(pernet);
+ __reset_counters(pernet);
+ spin_unlock_bh(&pernet->lock);
+ return 0;
+}
+
+static int mptcp_nl_fill_addr(struct sk_buff *skb,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_addr_info *addr = &entry->addr;
+ struct nlattr *attr;
+
+ attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ goto nla_put_failure;
+ if (entry->ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ goto nla_put_failure;
+
+ if (addr->family == AF_INET)
+ nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
+ addr->addr.s_addr);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6);
+#endif
+ nla_nest_end(skb, attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, attr);
+ return -EMSGSIZE;
+}
+
+static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct sk_buff *msg;
+ void *reply;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply) {
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto unlock_fail;
+ }
+
+ ret = mptcp_nl_fill_addr(msg, entry);
+ if (ret)
+ goto unlock_fail;
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+
+unlock_fail:
+ spin_unlock_bh(&pernet->lock);
+
+fail:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(msg->sk);
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int id = cb->args[0];
+ void *hdr;
+
+ pernet = net_generic(net, pm_nl_pernet_id);
+
+ spin_lock_bh(&pernet->lock);
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (entry->addr.id <= id)
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
+
+ id = entry->addr.id;
+ genlmsg_end(msg, hdr);
+ }
+ spin_unlock_bh(&pernet->lock);
+
+ cb->args[0] = id;
+ return msg->len;
+}
+
+static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
+{
+ struct nlattr *attr = info->attrs[id];
+
+ if (!attr)
+ return 0;
+
+ *limit = nla_get_u32(attr);
+ if (*limit > MPTCP_PM_ADDR_MAX) {
+ GENL_SET_ERR_MSG(info, "limit greater than maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ unsigned int rcv_addrs, subflows;
+ int ret;
+
+ spin_lock_bh(&pernet->lock);
+ rcv_addrs = pernet->add_addr_accept_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
+ if (ret)
+ goto unlock;
+
+ subflows = pernet->subflows_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
+ if (ret)
+ goto unlock;
+
+ WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
+ WRITE_ONCE(pernet->subflows_max, subflows);
+
+unlock:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+static int
+mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct sk_buff *msg;
+ void *reply;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ MPTCP_PM_CMD_GET_LIMITS);
+ if (!reply)
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
+ READ_ONCE(pernet->add_addr_accept_max)))
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
+ READ_ONCE(pernet->subflows_max)))
+ goto fail;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+fail:
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static struct genl_ops mptcp_pm_ops[] = {
+ {
+ .cmd = MPTCP_PM_CMD_ADD_ADDR,
+ .doit = mptcp_nl_cmd_add_addr,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_DEL_ADDR,
+ .doit = mptcp_nl_cmd_del_addr,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
+ .doit = mptcp_nl_cmd_flush_addrs,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_ADDR,
+ .doit = mptcp_nl_cmd_get_addr,
+ .dumpit = mptcp_nl_cmd_dump_addrs,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_LIMITS,
+ .doit = mptcp_nl_cmd_set_limits,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_LIMITS,
+ .doit = mptcp_nl_cmd_get_limits,
+ },
+};
+
+static struct genl_family mptcp_genl_family __ro_after_init = {
+ .name = MPTCP_PM_NAME,
+ .version = MPTCP_PM_VER,
+ .maxattr = MPTCP_PM_ATTR_MAX,
+ .policy = mptcp_pm_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = mptcp_pm_ops,
+ .n_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .mcgrps = mptcp_pm_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
+};
+
+static int __net_init pm_nl_init_net(struct net *net)
+{
+ struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+
+ INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
+ __reset_counters(pernet);
+ pernet->next_id = 1;
+ spin_lock_init(&pernet->lock);
+ return 0;
+}
+
+static void __net_exit pm_nl_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_list, exit_list) {
+ /* net is removed from namespace list, can't race with
+ * other modifiers
+ */
+ __flush_addrs(net_generic(net, pm_nl_pernet_id));
+ }
+}
+
+static struct pernet_operations mptcp_pm_pernet_ops = {
+ .init = pm_nl_init_net,
+ .exit_batch = pm_nl_exit_net,
+ .id = &pm_nl_pernet_id,
+ .size = sizeof(struct pm_nl_pernet),
+};
+
+void mptcp_pm_nl_init(void)
+{
+ if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
+ panic("Failed to register MPTCP PM pernet subsystem.\n");
+
+ if (genl_register_family(&mptcp_genl_family))
+ panic("Failed to register MPTCP PM netlink family\n");
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3c19a8efdcea..1833bc1f4a43 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,6 +21,7 @@
#endif
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
#define MPTCP_SAME_STATE TCP_MAX_STATES
@@ -31,6 +32,14 @@ struct mptcp6_sock {
};
#endif
+struct mptcp_skb_cb {
+ u32 offset;
+};
+
+#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+
+static struct percpu_counter mptcp_sockets_allocated;
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -98,17 +107,195 @@ set_state:
return ssock;
}
-static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
+static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb,
+ unsigned int offset, size_t copy_len)
{
- struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
- sock_owned_by_me((const struct sock *)msk);
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
- mptcp_for_each_subflow(msk, subflow) {
- return mptcp_subflow_tcp_sock(subflow);
+ msk->ack_seq += copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+}
+
+static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
+ struct sock *ssk,
+ unsigned int *bytes)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+ bool more_data_avail;
+ struct tcp_sock *tp;
+ bool done = false;
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
+
+ if (rcvbuf > sk->sk_rcvbuf)
+ sk->sk_rcvbuf = rcvbuf;
}
- return NULL;
+ tp = tcp_sk(ssk);
+ do {
+ u32 map_remaining, offset;
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ bool fin;
+
+ /* try to move as much data as available */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ break;
+
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ if (fin) {
+ done = true;
+ seq++;
+ }
+
+ if (offset < skb->len) {
+ size_t len = skb->len - offset;
+
+ if (tp->urg_data)
+ done = true;
+
+ __mptcp_move_skb(msk, ssk, skb, offset, len);
+ seq += len;
+ moved += len;
+
+ if (WARN_ON_ONCE(map_remaining < len))
+ break;
+ } else {
+ WARN_ON_ONCE(!fin);
+ sk_eat_skb(ssk, skb);
+ done = true;
+ }
+
+ WRITE_ONCE(tp->copied_seq, seq);
+ more_data_avail = mptcp_subflow_data_available(ssk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
+ done = true;
+ break;
+ }
+ } while (more_data_avail);
+
+ *bytes = moved;
+
+ return done;
+}
+
+/* In most cases we will be able to lock the mptcp socket. If its already
+ * owned, we need to defer to the work queue to avoid ABBA deadlock.
+ */
+static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+
+ if (READ_ONCE(sk->sk_lock.owned))
+ return false;
+
+ if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
+ return false;
+
+ /* must re-check after taking the lock */
+ if (!READ_ONCE(sk->sk_lock.owned))
+ __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+
+ spin_unlock_bh(&sk->sk_lock.slock);
+
+ return moved > 0;
+}
+
+void mptcp_data_ready(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+
+ if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
+ move_skbs_to_msk(msk, ssk))
+ goto wake;
+
+ /* don't schedule if mptcp sk is (still) over limit */
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
+ goto wake;
+
+ /* mptcp socket is owned, release_cb should retry */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+ &sk->sk_tsq_flags)) {
+ sock_hold(sk);
+
+ /* need to try again, its possible release_cb() has already
+ * been called after the test_and_set_bit() above.
+ */
+ move_skbs_to_msk(msk, ssk);
+ }
+wake:
+ sk->sk_data_ready(sk);
+}
+
+static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ if (likely(list_empty(&msk->join_list)))
+ return;
+
+ spin_lock_bh(&msk->join_list_lock);
+ list_splice_tail_init(&msk->join_list, &msk->conn_list);
+ spin_unlock_bh(&msk->join_list_lock);
+}
+
+static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+{
+ long tout = ssk && inet_csk(ssk)->icsk_pending ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+
+ if (tout <= 0)
+ tout = mptcp_sk(sk)->timer_ival;
+ mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
+}
+
+static bool mptcp_timer_pending(struct sock *sk)
+{
+ return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+}
+
+static void mptcp_reset_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long tout;
+
+ /* should never be called with mptcp level timer cleared */
+ tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
+ if (WARN_ON_ONCE(!tout))
+ tout = TCP_RTO_MIN;
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+}
+
+void mptcp_data_acked(struct sock *sk)
+{
+ mptcp_reset_timer(sk);
+
+ if (!sk_stream_is_writeable(sk) &&
+ schedule_work(&mptcp_sk(sk)->work))
+ sock_hold(sk);
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
}
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
@@ -134,41 +321,149 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
return NULL;
}
-static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
- const struct sk_buff *skb,
- const struct mptcp_ext *mpext)
+static bool mptcp_skb_can_collapse_to(u64 write_seq,
+ const struct sk_buff *skb,
+ const struct mptcp_ext *mpext)
{
if (!tcp_skb_can_collapse_to(skb))
return false;
/* can collapse only if MPTCP level sequence is in order */
- return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+ return mpext && mpext->data_seq + mpext->data_len == write_seq;
+}
+
+static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
+ const struct page_frag *pfrag,
+ const struct mptcp_data_frag *df)
+{
+ return df && pfrag->page == df->page &&
+ df->data_seq + df->data_len == msk->write_seq;
+}
+
+static void dfrag_uncharge(struct sock *sk, int len)
+{
+ sk_mem_uncharge(sk, len);
+ sk_wmem_queued_add(sk, -len);
+}
+
+static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
+{
+ int len = dfrag->data_len + dfrag->overhead;
+
+ list_del(&dfrag->list);
+ dfrag_uncharge(sk, len);
+ put_page(dfrag->page);
+}
+
+static void mptcp_clean_una(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+ u64 snd_una = atomic64_read(&msk->snd_una);
+ bool cleaned = false;
+
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
+ if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
+ break;
+
+ dfrag_clear(sk, dfrag);
+ cleaned = true;
+ }
+
+ dfrag = mptcp_rtx_head(sk);
+ if (dfrag && after64(snd_una, dfrag->data_seq)) {
+ u64 delta = dfrag->data_seq + dfrag->data_len - snd_una;
+
+ dfrag->data_seq += delta;
+ dfrag->data_len -= delta;
+
+ dfrag_uncharge(sk, delta);
+ cleaned = true;
+ }
+
+ if (cleaned) {
+ sk_mem_reclaim_partial(sk);
+
+ /* Only wake up writers if a subflow is ready */
+ if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ sk_stream_write_space(sk);
+ }
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+ pfrag, sk->sk_allocation)))
+ return true;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+
+static struct mptcp_data_frag *
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
+ int orig_offset)
+{
+ int offset = ALIGN(orig_offset, sizeof(long));
+ struct mptcp_data_frag *dfrag;
+
+ dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
+ dfrag->data_len = 0;
+ dfrag->data_seq = msk->write_seq;
+ dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
+ dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+ dfrag->page = pfrag->page;
+
+ return dfrag;
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
- struct msghdr *msg, long *timeo, int *pmss_now,
+ struct msghdr *msg, struct mptcp_data_frag *dfrag,
+ long *timeo, int *pmss_now,
int *ps_goal)
{
- int mss_now, avail_size, size_goal, ret;
+ int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
+ bool dfrag_collapsed, can_collapse = false;
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL;
+ bool retransmission = !!dfrag;
struct sk_buff *skb, *tail;
- bool can_collapse = false;
struct page_frag *pfrag;
+ struct page *page;
+ u64 *write_seq;
size_t psize;
/* use the mptcp page cache so that we can easily move the data
* from one substream to another, but do per subflow memory accounting
+ * Note: pfrag is used only !retransmission, but the compiler if
+ * fooled into a warning if we don't init here
*/
pfrag = sk_page_frag(sk);
- while (!sk_page_frag_refill(ssk, pfrag) ||
+ while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
!mptcp_ext_cache_refill(msk)) {
ret = sk_stream_wait_memory(ssk, timeo);
if (ret)
return ret;
+
+ /* if sk_stream_wait_memory() sleeps snd_una can change
+ * significantly, refresh the rtx queue
+ */
+ mptcp_clean_una(sk);
+
if (unlikely(__mptcp_needs_tcp_fallback(msk)))
return 0;
}
+ if (!retransmission) {
+ write_seq = &msk->write_seq;
+ page = pfrag->page;
+ } else {
+ write_seq = &dfrag->data_seq;
+ page = dfrag->page;
+ }
/* compute copy limit */
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
@@ -186,32 +481,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* SSN association set here
*/
can_collapse = (size_goal - skb->len > 0) &&
- mptcp_skb_can_collapse_to(msk, skb, mpext);
+ mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
if (!can_collapse)
TCP_SKB_CB(skb)->eor = 1;
else
avail_size = size_goal - skb->len;
}
- psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
-
- /* Copy to page */
- pr_debug("left=%zu", msg_data_left(msg));
- psize = copy_page_from_iter(pfrag->page, pfrag->offset,
- min_t(size_t, msg_data_left(msg), psize),
- &msg->msg_iter);
- pr_debug("left=%zu", msg_data_left(msg));
- if (!psize)
- return -EINVAL;
+
+ if (!retransmission) {
+ /* reuse tail pfrag, if possible, or carve a new one from the
+ * page allocator
+ */
+ dfrag = mptcp_rtx_tail(sk);
+ offset = pfrag->offset;
+ dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+ if (!dfrag_collapsed) {
+ dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
+ offset = dfrag->offset;
+ frag_truesize = dfrag->overhead;
+ }
+ psize = min_t(size_t, pfrag->size - offset, avail_size);
+
+ /* Copy to page */
+ pr_debug("left=%zu", msg_data_left(msg));
+ psize = copy_page_from_iter(pfrag->page, offset,
+ min_t(size_t, msg_data_left(msg),
+ psize),
+ &msg->msg_iter);
+ pr_debug("left=%zu", msg_data_left(msg));
+ if (!psize)
+ return -EINVAL;
+
+ if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
+ return -ENOMEM;
+ } else {
+ offset = dfrag->offset;
+ psize = min_t(size_t, dfrag->data_len, avail_size);
+ }
/* tell the TCP stack to delay the push so that we can safely
* access the skb after the sendpages call
*/
- ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+ ret = do_tcp_sendpages(ssk, page, offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
if (ret <= 0)
return ret;
- if (unlikely(ret < psize))
- iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ frag_truesize += ret;
+ if (!retransmission) {
+ if (unlikely(ret < psize))
+ iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ /* send successful, keep track of sent data for mptcp-level
+ * retransmission
+ */
+ dfrag->data_len += ret;
+ if (!dfrag_collapsed) {
+ get_page(dfrag->page);
+ list_add_tail(&dfrag->list, &msk->rtx_queue);
+ sk_wmem_queued_add(sk, frag_truesize);
+ } else {
+ sk_wmem_queued_add(sk, ret);
+ }
+
+ /* charge data on mptcp rtx queue to the master socket
+ * Note: we charge such data both to sk and ssk
+ */
+ sk->sk_forward_alloc -= frag_truesize;
+ }
/* if the tail skb extension is still the cached one, collapsing
* really happened. Note: we can't check for 'same skb' as the sk_buff
@@ -230,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
msk->cached_ext = NULL;
memset(mpext, 0, sizeof(*mpext));
- mpext->data_seq = msk->write_seq;
+ mpext->data_seq = *write_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret;
mpext->use_map = 1;
@@ -241,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->dsn64);
out:
- pfrag->offset += ret;
- msk->write_seq += ret;
+ if (!retransmission)
+ pfrag->offset += frag_truesize;
+ *write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
return ret;
}
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!sk_stream_memory_free(ssk)) {
+ struct socket *sock = ssk->sk_socket;
+
+ if (sock) {
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic();
+
+ /* enables sk->write_space() callbacks */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
+
+ return NULL;
+ }
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+
+ continue;
+ }
+
+ return ssk;
+ }
+
+ return backup;
+}
+
static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
struct socket *sock;
@@ -278,6 +653,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EOPNOTSUPP;
lock_sock(sk);
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+ ret = sk_stream_wait_connect(sk, &timeo);
+ if (ret)
+ goto out;
+ }
+
ssock = __mptcp_tcp_fallback(msk);
if (unlikely(ssock)) {
fallback:
@@ -286,19 +670,29 @@ fallback:
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}
- timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ mptcp_clean_una(sk);
- ssk = mptcp_subflow_get(msk);
- if (!ssk) {
- release_sock(sk);
- return -ENOTCONN;
+ __mptcp_flush_join_list(msk);
+ ssk = mptcp_subflow_get_send(msk);
+ while (!sk_stream_memory_free(sk) || !ssk) {
+ ret = sk_stream_wait_memory(sk, &timeo);
+ if (ret)
+ goto out;
+
+ mptcp_clean_una(sk);
+
+ ssk = mptcp_subflow_get_send(msk);
+ if (list_empty(&msk->conn_list)) {
+ ret = -ENOTCONN;
+ goto out;
+ }
}
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
while (msg_data_left(msg)) {
- ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+ ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
if (ret < 0)
break;
@@ -311,73 +705,101 @@ fallback:
copied += ret;
}
+ mptcp_set_timeout(sk, ssk);
if (copied) {
ret = copied;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal);
+
+ /* start the timer, if it's not pending */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
}
ssk_check_wmem(msk, ssk);
release_sock(ssk);
+out:
release_sock(sk);
return ret;
}
-int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_wait_data(struct sock *sk, long *timeo)
{
- struct mptcp_read_arg *arg = desc->arg.data;
- size_t copy_len;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+
+ sk_wait_event(sk, timeo,
+ test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
+
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
- copy_len = min(desc->count, len);
+static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb;
+ int copied = 0;
- if (likely(arg->msg)) {
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 offset = MPTCP_SKB_CB(skb)->offset;
+ u32 data_len = skb->len - offset;
+ u32 count = min_t(size_t, len - copied, data_len);
int err;
- err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len);
- if (err) {
- pr_debug("error path");
- desc->error = err;
- return err;
+ err = skb_copy_datagram_msg(skb, offset, msg, count);
+ if (unlikely(err < 0)) {
+ if (!copied)
+ return err;
+ break;
+ }
+
+ copied += count;
+
+ if (count < data_len) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ break;
}
- } else {
- pr_debug("Flushing skb payload");
- }
- desc->count -= copy_len;
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ __kfree_skb(skb);
- pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ if (copied >= len)
+ break;
+ }
+
+ return copied;
}
-static void mptcp_wait_data(struct sock *sk, long *timeo)
+static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- struct mptcp_sock *msk = mptcp_sk(sk);
+ unsigned int moved = 0;
+ bool done;
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ do {
+ struct sock *ssk = mptcp_subflow_recv_lookup(msk);
- sk_wait_event(sk, timeo,
- test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
+ if (!ssk)
+ break;
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
+ lock_sock(ssk);
+ done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ release_sock(ssk);
+ } while (!done);
+
+ return moved > 0;
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_subflow_context *subflow;
- bool more_data_avail = false;
- struct mptcp_read_arg arg;
- read_descriptor_t desc;
- bool wait_data = false;
struct socket *ssock;
- struct tcp_sock *tp;
- bool done = false;
- struct sock *ssk;
int copied = 0;
int target;
long timeo;
@@ -395,65 +817,27 @@ fallback:
return copied;
}
- arg.msg = msg;
- desc.arg.data = &arg;
- desc.error = 0;
-
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ __mptcp_flush_join_list(msk);
- while (!done) {
- u32 map_remaining;
+ while (len > (size_t)copied) {
int bytes_read;
- ssk = mptcp_subflow_recv_lookup(msk);
- pr_debug("msk=%p ssk=%p", msk, ssk);
- if (!ssk)
- goto wait_for_data;
-
- subflow = mptcp_subflow_ctx(ssk);
- tp = tcp_sk(ssk);
-
- lock_sock(ssk);
- do {
- /* try to read as much data as available */
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- desc.count = min_t(size_t, len - copied, map_remaining);
- pr_debug("reading %zu bytes, copied %d", desc.count,
- copied);
- bytes_read = tcp_read_sock(ssk, &desc,
- mptcp_read_actor);
- if (bytes_read < 0) {
- if (!copied)
- copied = bytes_read;
- done = true;
- goto next;
- }
+ bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
+ if (unlikely(bytes_read < 0)) {
+ if (!copied)
+ copied = bytes_read;
+ goto out_err;
+ }
- pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
- msk->ack_seq + bytes_read);
- msk->ack_seq += bytes_read;
- copied += bytes_read;
- if (copied >= len) {
- done = true;
- goto next;
- }
- if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
- pr_err("Urgent data present, cannot proceed");
- done = true;
- goto next;
- }
-next:
- more_data_avail = mptcp_subflow_data_available(ssk);
- } while (more_data_avail && !done);
- release_sock(ssk);
- continue;
+ copied += bytes_read;
-wait_for_data:
- more_data_avail = false;
+ if (skb_queue_empty(&sk->sk_receive_queue) &&
+ __mptcp_move_skbs(msk))
+ continue;
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
@@ -494,30 +878,92 @@ wait_for_data:
}
pr_debug("block timeout %ld", timeo);
- wait_data = true;
mptcp_wait_data(sk, &timeo);
if (unlikely(__mptcp_tcp_fallback(msk)))
goto fallback;
}
- if (more_data_avail) {
- if (!test_bit(MPTCP_DATA_READY, &msk->flags))
- set_bit(MPTCP_DATA_READY, &msk->flags);
- } else if (!wait_data) {
+ if (skb_queue_empty(&sk->sk_receive_queue)) {
+ /* entire backlog drained, clear DATA_READY. */
clear_bit(MPTCP_DATA_READY, &msk->flags);
- /* .. race-breaker: ssk might get new data after last
- * data_available() returns false.
+ /* .. race-breaker: ssk might have gotten new data
+ * after last __mptcp_move_skbs() returned false.
*/
- ssk = mptcp_subflow_recv_lookup(msk);
- if (unlikely(ssk))
+ if (unlikely(__mptcp_move_skbs(msk)))
set_bit(MPTCP_DATA_READY, &msk->flags);
+ } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
+ /* data to read but mptcp_wait_data() cleared DATA_READY */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
}
-
+out_err:
release_sock(sk);
return copied;
}
+static void mptcp_retransmit_handler(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (atomic64_read(&msk->snd_una) == msk->write_seq) {
+ mptcp_stop_timer(sk);
+ } else {
+ set_bit(MPTCP_WORK_RTX, &msk->flags);
+ if (schedule_work(&msk->work))
+ sock_hold(sk);
+ }
+}
+
+static void mptcp_retransmit_timer(struct timer_list *t)
+{
+ struct inet_connection_sock *icsk = from_timer(icsk, t,
+ icsk_retransmit_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ mptcp_retransmit_handler(sk);
+ } else {
+ /* delegate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
+ &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/* Find an idle subflow. Return NULL if there is unacked data at tcp
+ * level.
+ *
+ * A backup subflow is returned only if that is the only kind available.
+ */
+static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* still data outstanding at TCP level? Don't retransmit. */
+ if (!tcp_write_queue_empty(ssk))
+ return NULL;
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+ continue;
+ }
+
+ return ssk;
+ }
+
+ return backup;
+}
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -548,28 +994,135 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
+static void mptcp_worker(struct work_struct *work)
+{
+ struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
+ struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
+ int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+ struct mptcp_data_frag *dfrag;
+ u64 orig_write_seq;
+ size_t copied = 0;
+ struct msghdr msg;
+ long timeo = 0;
+
+ lock_sock(sk);
+ mptcp_clean_una(sk);
+ __mptcp_flush_join_list(msk);
+ __mptcp_move_skbs(msk);
+
+ if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
+ goto unlock;
+
+ dfrag = mptcp_rtx_head(sk);
+ if (!dfrag)
+ goto unlock;
+
+ ssk = mptcp_subflow_get_retrans(msk);
+ if (!ssk)
+ goto reset_unlock;
+
+ lock_sock(ssk);
+
+ msg.msg_flags = MSG_DONTWAIT;
+ orig_len = dfrag->data_len;
+ orig_offset = dfrag->offset;
+ orig_write_seq = dfrag->data_seq;
+ while (dfrag->data_len > 0) {
+ ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
+ &size_goal);
+ if (ret < 0)
+ break;
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+ copied += ret;
+ dfrag->data_len -= ret;
+ dfrag->offset += ret;
+ }
+ if (copied)
+ tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+ size_goal);
+
+ dfrag->data_seq = orig_write_seq;
+ dfrag->offset = orig_offset;
+ dfrag->data_len = orig_len;
+
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+
+reset_unlock:
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+
+unlock:
+ release_sock(sk);
+ sock_put(sk);
+}
+
static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ spin_lock_init(&msk->join_list_lock);
+
INIT_LIST_HEAD(&msk->conn_list);
+ INIT_LIST_HEAD(&msk->join_list);
+ INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
+ INIT_WORK(&msk->work, mptcp_worker);
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
+ mptcp_pm_data_init(msk);
+
+ /* re-use the csk retrans timer for MPTCP-level retrans */
+ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+
return 0;
}
static int mptcp_init_sock(struct sock *sk)
{
- if (!mptcp_is_enabled(sock_net(sk)))
+ struct net *net = sock_net(sk);
+ int ret;
+
+ if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
- return __mptcp_init_sock(sk);
+ if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
+ return -ENOMEM;
+
+ ret = __mptcp_init_sock(sk);
+ if (ret)
+ return ret;
+
+ sk_sockets_allocated_inc(sk);
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+
+ return 0;
}
-static void mptcp_subflow_shutdown(struct sock *ssk, int how)
+static void __mptcp_clear_xmit(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+
+ sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
+ dfrag_clear(sk, dfrag);
+}
+
+static void mptcp_cancel_work(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (cancel_work_sync(&msk->work))
+ sock_put(sk);
+}
+
+static void mptcp_subflow_shutdown(struct sock *ssk, int how,
+ bool data_fin_tx_enable, u64 data_fin_tx_seq)
{
lock_sock(ssk);
@@ -582,6 +1135,14 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how)
tcp_disconnect(ssk, O_NONBLOCK);
break;
default:
+ if (data_fin_tx_enable) {
+ struct mptcp_subflow_context *subflow;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->data_fin_tx_seq = data_fin_tx_seq;
+ subflow->data_fin_tx_enable = 1;
+ }
+
ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how);
break;
@@ -598,22 +1159,36 @@ static void mptcp_close(struct sock *sk, long timeout)
struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
LIST_HEAD(conn_list);
+ u64 data_fin_tx_seq;
lock_sock(sk);
mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE);
+ __mptcp_flush_join_list(msk);
+
list_splice_init(&msk->conn_list, &conn_list);
+ data_fin_tx_seq = msk->write_seq;
+
+ __mptcp_clear_xmit(sk);
+
release_sock(sk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ subflow->data_fin_tx_seq = data_fin_tx_seq;
+ subflow->data_fin_tx_enable = 1;
__mptcp_close_ssk(sk, ssk, subflow, timeout);
}
+ mptcp_cancel_work(sk);
+ mptcp_pm_close(msk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+
sk_common_release(sk);
}
@@ -640,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
+static int mptcp_disconnect(struct sock *sk, int flags)
+{
+ lock_sock(sk);
+ __mptcp_clear_xmit(sk);
+ release_sock(sk);
+ mptcp_cancel_work(sk);
+ return tcp_disconnect(sk, flags);
+}
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
@@ -649,9 +1233,12 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
}
#endif
-static struct sock *mptcp_sk_clone_lock(const struct sock *sk)
+struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
+ struct mptcp_sock *msk;
+ u64 ack_seq;
if (!nsk)
return NULL;
@@ -661,6 +1248,40 @@ static struct sock *mptcp_sk_clone_lock(const struct sock *sk)
inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif
+ __mptcp_init_sock(nsk);
+
+ msk = mptcp_sk(nsk);
+ msk->local_key = subflow_req->local_key;
+ msk->token = subflow_req->token;
+ msk->subflow = NULL;
+
+ if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
+ bh_unlock_sock(nsk);
+
+ /* we can't call into mptcp_close() here - possible BH context
+ * free the sock directly
+ */
+ nsk->sk_prot->destroy(nsk);
+ sk_free(nsk);
+ return NULL;
+ }
+
+ msk->write_seq = subflow_req->idsn + 1;
+ atomic64_set(&msk->snd_una, msk->write_seq);
+ if (subflow_req->remote_key_valid) {
+ msk->can_ack = true;
+ msk->remote_key = subflow_req->remote_key;
+ mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ msk->ack_seq = ack_seq;
+ }
+
+ /* will be fully established after successful MPC subflow creation */
+ inet_sk_state_store(nsk, TCP_SYN_RECV);
+ bh_unlock_sock(nsk);
+
+ /* keep a single reference */
+ __sock_put(nsk);
return nsk;
}
@@ -688,62 +1309,37 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
struct sock *ssk = newsk;
- u64 ack_seq;
subflow = mptcp_subflow_ctx(newsk);
- lock_sock(sk);
+ new_mptcp_sock = subflow->conn;
- local_bh_disable();
- new_mptcp_sock = mptcp_sk_clone_lock(sk);
- if (!new_mptcp_sock) {
- *err = -ENOBUFS;
- local_bh_enable();
- release_sock(sk);
- mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1);
- tcp_close(newsk, 0);
- return NULL;
+ /* is_mptcp should be false if subflow->conn is missing, see
+ * subflow_syn_recv_sock()
+ */
+ if (WARN_ON_ONCE(!new_mptcp_sock)) {
+ tcp_sk(newsk)->is_mptcp = 0;
+ return newsk;
}
- __mptcp_init_sock(new_mptcp_sock);
+ /* acquire the 2nd reference for the owning socket */
+ sock_hold(new_mptcp_sock);
+ local_bh_disable();
+ bh_lock_sock(new_mptcp_sock);
msk = mptcp_sk(new_mptcp_sock);
- msk->local_key = subflow->local_key;
- msk->token = subflow->token;
- msk->subflow = NULL;
msk->first = newsk;
- mptcp_token_update_accept(newsk, new_mptcp_sock);
-
- msk->write_seq = subflow->idsn + 1;
- if (subflow->can_ack) {
- msk->can_ack = true;
- msk->remote_key = subflow->remote_key;
- mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
- ack_seq++;
- msk->ack_seq = ack_seq;
- }
newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
- /* will be fully established at mptcp_stream_accept()
- * completion.
- */
- inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
bh_unlock_sock(new_mptcp_sock);
- local_bh_enable();
- release_sock(sk);
- /* the subflow can already receive packet, avoid racing with
- * the receive path and process the pending ones
- */
- lock_sock(ssk);
- subflow->rel_write_seq = 1;
- subflow->tcp_sock = ssk;
- subflow->conn = new_mptcp_sock;
- if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue)))
- mptcp_subflow_data_available(ssk);
- release_sock(ssk);
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+ local_bh_enable();
+ } else {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
}
return newsk;
@@ -755,6 +1351,8 @@ static void mptcp_destroy(struct sock *sk)
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
+
+ sk_sockets_allocated_dec(sk);
}
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -807,6 +1405,40 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
}
+#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED)
+
+/* this is very alike tcp_release_cb() but we must handle differently a
+ * different set of events
+ */
+static void mptcp_release_cb(struct sock *sk)
+{
+ unsigned long flags, nflags;
+
+ do {
+ flags = sk->sk_tsq_flags;
+ if (!(flags & MPTCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~MPTCP_DEFERRED_ALL;
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
+
+ sock_release_ownership(sk);
+
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_recv_lookup(msk);
+ if (!ssk || !schedule_work(&msk->work))
+ __sock_put(sk);
+ }
+
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+ mptcp_retransmit_handler(sk);
+ __sock_put(sk);
+ }
+}
+
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -828,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk)
u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk);
-
- if (!subflow->mp_capable)
- return;
-
sk = subflow->conn;
msk = mptcp_sk(sk);
+ if (!subflow->mp_capable) {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+ return;
+ }
+
pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -852,6 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
+ atomic64_set(&msk->snd_una, msk->write_seq);
+
+ mptcp_pm_new_connection(msk, 0);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -863,6 +1500,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->sk_callback_lock);
}
+bool mptcp_finish_join(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct sock *parent = (void *)msk;
+ struct socket *parent_sock;
+ bool ret;
+
+ pr_debug("msk=%p, subflow=%p", msk, subflow);
+
+ /* mptcp socket already closing? */
+ if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
+ return false;
+
+ if (!msk->pm.server_side)
+ return true;
+
+ /* passive connection, attach to msk socket */
+ parent_sock = READ_ONCE(parent->sk_socket);
+ if (parent_sock && !sk->sk_socket)
+ mptcp_sock_graft(sk, parent_sock);
+
+ ret = mptcp_pm_allow_new_subflow(msk);
+ if (ret) {
+ /* active connections are already on conn_list */
+ spin_lock_bh(&msk->join_list_lock);
+ if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+ }
+ return ret;
+}
+
+bool mptcp_sk_is_subflow(const struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ return subflow->mp_join == 1;
+}
+
static bool mptcp_memory_free(const struct sock *sk, int wake)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -874,6 +1551,7 @@ static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
.init = mptcp_init_sock,
+ .disconnect = mptcp_disconnect,
.close = mptcp_close,
.accept = mptcp_accept,
.setsockopt = mptcp_setsockopt,
@@ -882,10 +1560,16 @@ static struct proto mptcp_prot = {
.destroy = mptcp_destroy,
.sendmsg = mptcp_sendmsg,
.recvmsg = mptcp_recvmsg,
+ .release_cb = mptcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = mptcp_get_port,
+ .sockets_allocated = &mptcp_sockets_allocated,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
.stream_memory_free = mptcp_memory_free,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
.no_autobind = true,
};
@@ -1041,14 +1725,13 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
+ __mptcp_flush_join_list(msk);
list_for_each_entry(subflow, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!ssk->sk_socket)
mptcp_sock_graft(ssk, newsock);
}
-
- inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);
}
sock_put(ssock->sk);
@@ -1124,10 +1807,11 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED;
}
+ __mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
- mptcp_subflow_shutdown(tcp_sk, how);
+ mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
}
out_unlock:
@@ -1174,12 +1858,18 @@ void mptcp_proto_init(void)
{
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
+ if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
+ panic("Failed to allocate MPTCP pcpu counter\n");
+
mptcp_subflow_init();
+ mptcp_pm_init();
if (proto_register(&mptcp_prot, 1) != 0)
panic("Failed to register MPTCP proto.\n");
inet_register_protosw(&mptcp_protosw);
+
+ BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 9f8663b30456..f733c5425552 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -17,6 +17,12 @@
#define OPTION_MPTCP_MPC_SYN BIT(0)
#define OPTION_MPTCP_MPC_SYNACK BIT(1)
#define OPTION_MPTCP_MPC_ACK BIT(2)
+#define OPTION_MPTCP_MPJ_SYN BIT(3)
+#define OPTION_MPTCP_MPJ_SYNACK BIT(4)
+#define OPTION_MPTCP_MPJ_ACK BIT(5)
+#define OPTION_MPTCP_ADD_ADDR BIT(6)
+#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
+#define OPTION_MPTCP_RM_ADDR BIT(8)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -33,12 +39,30 @@
#define TCPOLEN_MPTCP_MPC_SYNACK 12
#define TCPOLEN_MPTCP_MPC_ACK 20
#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
+#define TCPOLEN_MPTCP_MPJ_SYN 12
+#define TCPOLEN_MPTCP_MPJ_SYNACK 16
+#define TCPOLEN_MPTCP_MPJ_ACK 24
#define TCPOLEN_MPTCP_DSS_BASE 4
#define TCPOLEN_MPTCP_DSS_ACK32 4
#define TCPOLEN_MPTCP_DSS_ACK64 8
#define TCPOLEN_MPTCP_DSS_MAP32 10
#define TCPOLEN_MPTCP_DSS_MAP64 14
#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
+#define TCPOLEN_MPTCP_ADD_ADDR 16
+#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10
+#define TCPOLEN_MPTCP_ADD_ADDR6 28
+#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
+#define TCPOLEN_MPTCP_PORT_LEN 2
+#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+
+/* MPTCP MP_JOIN flags */
+#define MPTCPOPT_BACKUP BIT(0)
+#define MPTCPOPT_HMAC_LEN 20
+#define MPTCPOPT_THMAC_LEN 8
/* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK (0x0F)
@@ -55,9 +79,75 @@
#define MPTCP_DSS_HAS_ACK BIT(0)
#define MPTCP_DSS_FLAG_MASK (0x1F)
+/* MPTCP ADD_ADDR flags */
+#define MPTCP_ADDR_ECHO BIT(0)
+#define MPTCP_ADDR_HMAC_LEN 20
+#define MPTCP_ADDR_IPVERSION_4 4
+#define MPTCP_ADDR_IPVERSION_6 6
+
/* MPTCP socket flags */
#define MPTCP_DATA_READY 0
#define MPTCP_SEND_SPACE 1
+#define MPTCP_WORK_RTX 2
+
+static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
+{
+ return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
+ ((nib & 0xF) << 8) | field);
+}
+
+#define MPTCP_PM_MAX_ADDR 4
+
+struct mptcp_addr_info {
+ sa_family_t family;
+ __be16 port;
+ u8 id;
+ union {
+ struct in_addr addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ struct in6_addr addr6;
+#endif
+ };
+};
+
+enum mptcp_pm_status {
+ MPTCP_PM_ADD_ADDR_RECEIVED,
+ MPTCP_PM_ESTABLISHED,
+ MPTCP_PM_SUBFLOW_ESTABLISHED,
+};
+
+struct mptcp_pm_data {
+ struct mptcp_addr_info local;
+ struct mptcp_addr_info remote;
+
+ spinlock_t lock; /*protects the whole PM data */
+
+ bool addr_signal;
+ bool server_side;
+ bool work_pending;
+ bool accept_addr;
+ bool accept_subflow;
+ u8 add_addr_signaled;
+ u8 add_addr_accepted;
+ u8 local_addr_used;
+ u8 subflows;
+ u8 add_addr_signal_max;
+ u8 add_addr_accept_max;
+ u8 local_addr_max;
+ u8 subflows_max;
+ u8 status;
+
+ struct work_struct work;
+};
+
+struct mptcp_data_frag {
+ struct list_head list;
+ u64 data_seq;
+ int data_len;
+ int offset;
+ int overhead;
+ struct page *page;
+};
/* MPTCP connection sock */
struct mptcp_sock {
@@ -67,13 +157,20 @@ struct mptcp_sock {
u64 remote_key;
u64 write_seq;
u64 ack_seq;
+ atomic64_t snd_una;
+ unsigned long timer_ival;
u32 token;
unsigned long flags;
bool can_ack;
+ spinlock_t join_list_lock;
+ struct work_struct work;
struct list_head conn_list;
+ struct list_head rtx_queue;
+ struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
+ struct mptcp_pm_data pm;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -84,17 +181,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
return (struct mptcp_sock *)sk;
}
+static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (list_empty(&msk->rtx_queue))
+ return NULL;
+
+ return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (list_empty(&msk->rtx_queue))
+ return NULL;
+
+ return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
struct mptcp_subflow_request_sock {
struct tcp_request_sock sk;
u16 mp_capable : 1,
mp_join : 1,
backup : 1,
remote_key_valid : 1;
+ u8 local_id;
+ u8 remote_id;
u64 local_key;
u64 remote_key;
u64 idsn;
u32 token;
u32 ssn_offset;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_nonce;
};
static inline struct mptcp_subflow_request_sock *
@@ -117,14 +239,28 @@ struct mptcp_subflow_context {
u32 ssn_offset;
u32 map_data_len;
u32 request_mptcp : 1, /* send MP_CAPABLE */
+ request_join : 1, /* send MP_JOIN */
+ request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */
- fourth_ack : 1, /* send initial DSS */
+ mp_join : 1, /* remote is JOINing */
+ fully_established : 1, /* path validated */
+ pm_notified : 1, /* PM hook called for established status */
conn_finished : 1,
map_valid : 1,
mpc_map : 1,
+ backup : 1,
data_avail : 1,
rx_eof : 1,
+ data_fin_tx_enable : 1,
can_ack : 1; /* only after processing the remote a key */
+ u64 data_fin_tx_seq;
+ u32 remote_nonce;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_token;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ u8 local_id;
+ u8 remote_id;
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
@@ -168,6 +304,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
int mptcp_is_enabled(struct net *net);
bool mptcp_subflow_data_available(struct sock *sk);
void mptcp_subflow_init(void);
+
+/* called with sk socket lock held */
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
@@ -190,23 +331,20 @@ void mptcp_proto_init(void);
int mptcp_proto_v6_init(void);
#endif
-struct mptcp_read_arg {
- struct msghdr *msg;
-};
-
-int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len);
-
+struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
struct tcp_options_received *opt_rx);
void mptcp_finish_connect(struct sock *sk);
+void mptcp_data_ready(struct sock *sk, struct sock *ssk);
+bool mptcp_finish_join(struct sock *sk);
+void mptcp_data_acked(struct sock *sk);
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(u32 token);
int mptcp_token_new_connect(struct sock *sk);
-int mptcp_token_new_accept(u32 token);
-void mptcp_token_update_accept(struct sock *sk, struct sock *conn);
+int mptcp_token_new_accept(u32 token, struct sock *conn);
+struct mptcp_sock *mptcp_token_get_sock(u32 token);
void mptcp_token_destroy(u32 token);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
@@ -222,8 +360,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
mptcp_crypto_key_sha(*key, token, idsn);
}
-void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
- void *hash_out);
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
+
+void mptcp_pm_init(void);
+void mptcp_pm_data_init(struct mptcp_sock *msk);
+void mptcp_pm_close(struct mptcp_sock *msk);
+void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
+void mptcp_pm_fully_established(struct mptcp_sock *msk);
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
+void mptcp_pm_connection_closed(struct mptcp_sock *msk);
+void mptcp_pm_subflow_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow);
+void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
+void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id);
+
+static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal);
+}
+
+static inline unsigned int mptcp_add_addr_len(int family)
+{
+ if (family == AF_INET)
+ return TCPOLEN_MPTCP_ADD_ADDR;
+ return TCPOLEN_MPTCP_ADD_ADDR6;
+}
+
+bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr);
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+
+void mptcp_pm_nl_init(void);
+void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
+void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
+void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
+void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
{
@@ -237,4 +415,6 @@ static inline bool before64(__u64 seq1, __u64 seq2)
#define after64(seq2, seq1) before64(seq1, seq2)
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
+
#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 65122edf60aa..b5180c81588e 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
+#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -19,17 +20,42 @@
#endif
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
+
+static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
+ enum linux_mptcp_mib_field field)
+{
+ MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
+}
static int subflow_rebuild_header(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- int err = 0;
+ int local_id, err = 0;
if (subflow->request_mptcp && !subflow->token) {
pr_debug("subflow=%p", sk);
err = mptcp_token_new_connect(sk);
+ } else if (subflow->request_join && !subflow->local_nonce) {
+ struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
+
+ pr_debug("subflow=%p", sk);
+
+ do {
+ get_random_bytes(&subflow->local_nonce, sizeof(u32));
+ } while (!subflow->local_nonce);
+
+ if (subflow->local_id)
+ goto out;
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+ if (local_id < 0)
+ return -EINVAL;
+
+ subflow->local_id = local_id;
}
+out:
if (err)
return err;
@@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req)
tcp_request_sock_ops.destructor(req);
}
+static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hmac)
+{
+ u8 msg[8];
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+}
+
+/* validate received token and create truncated hmac and nonce for SYN-ACK */
+static bool subflow_token_join_request(struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_sock *msk;
+ int local_id;
+
+ msk = mptcp_token_get_sock(subflow_req->token);
+ if (!msk) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
+ return false;
+ }
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
+ if (local_id < 0) {
+ sock_put((struct sock *)msk);
+ return false;
+ }
+ subflow_req->local_id = local_id;
+
+ get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
+
+ subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_req->local_nonce,
+ subflow_req->remote_nonce, hmac);
+
+ subflow_req->thmac = get_unaligned_be64(hmac);
+
+ sock_put((struct sock *)msk);
+ return true;
+}
+
static void subflow_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req,
mptcp_get_options(skb, &rx_opt);
subflow_req->mp_capable = 0;
+ subflow_req->mp_join = 0;
subflow_req->remote_key_valid = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req,
return;
#endif
+ if (rx_opt.mptcp.mp_capable) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
+
+ if (rx_opt.mptcp.mp_join)
+ return;
+ } else if (rx_opt.mptcp.mp_join) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+ }
+
if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
int err;
@@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 1;
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ subflow_req->mp_join = 1;
+ subflow_req->backup = rx_opt.mptcp.backup;
+ subflow_req->remote_id = rx_opt.mptcp.join_id;
+ subflow_req->token = rx_opt.mptcp.token;
+ subflow_req->remote_nonce = rx_opt.mptcp.nonce;
+ pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
+ subflow_req->remote_nonce);
+ if (!subflow_token_join_request(req, skb)) {
+ subflow_req->mp_join = 0;
+ // @@ need to trigger RST
+ }
}
}
@@ -106,13 +200,41 @@ static void subflow_v6_init_req(struct request_sock *req,
}
#endif
+/* validate received truncated hmac and create hmac for third ACK */
+static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
+{
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ u64 thmac;
+
+ subflow_generate_hmac(subflow->remote_key, subflow->local_key,
+ subflow->remote_nonce, subflow->local_nonce,
+ hmac);
+
+ thmac = get_unaligned_be64(hmac);
+ pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
+ subflow, subflow->token,
+ (unsigned long long)thmac,
+ (unsigned long long)subflow->thmac);
+
+ return thmac == subflow->thmac;
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
- if (subflow->conn && !subflow->conn_finished) {
+ if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
+ inet_sk_state_store(parent, TCP_ESTABLISHED);
+ parent->sk_state_change(parent);
+ }
+
+ if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
+ return;
+
+ if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key);
mptcp_finish_connect(sk);
@@ -122,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
}
+ } else if (subflow->mp_join) {
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
+ subflow, subflow->thmac,
+ subflow->remote_nonce);
+ if (!subflow_thmac_valid(subflow)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ subflow->mp_join = 0;
+ goto do_reset;
+ }
+
+ subflow_generate_hmac(subflow->local_key, subflow->remote_key,
+ subflow->local_nonce,
+ subflow->remote_nonce,
+ subflow->hmac);
+
+ if (skb)
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+
+ if (!mptcp_finish_join(sk))
+ goto do_reset;
+
+ subflow->conn_finished = 1;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+ } else {
+do_reset:
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
}
}
@@ -172,6 +321,32 @@ drop:
}
#endif
+/* validate hmac received in third ACK */
+static bool subflow_hmac_valid(const struct request_sock *req,
+ const struct tcp_options_received *rx_opt)
+{
+ const struct mptcp_subflow_request_sock *subflow_req;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_sock *msk;
+ bool ret;
+
+ subflow_req = mptcp_subflow_rsk(req);
+ msk = mptcp_token_get_sock(subflow_req->token);
+ if (!msk)
+ return false;
+
+ subflow_generate_hmac(msk->remote_key, msk->local_key,
+ subflow_req->remote_nonce,
+ subflow_req->local_nonce, hmac);
+
+ ret = true;
+ if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
+ ret = false;
+
+ sock_put((struct sock *)msk);
+ return ret;
+}
+
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
@@ -182,6 +357,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
struct mptcp_subflow_request_sock *subflow_req;
struct tcp_options_received opt_rx;
+ bool fallback_is_fatal = false;
+ struct sock *new_msk = NULL;
struct sock *child;
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
@@ -197,7 +374,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
* out-of-order pkt, which will not carry the MP_CAPABLE
* opt even on mptcp enabled paths
*/
- goto create_child;
+ goto create_msk;
}
opt_rx.mptcp.mp_capable = 0;
@@ -207,6 +384,21 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
subflow_req->remote_key_valid = 1;
} else {
subflow_req->mp_capable = 0;
+ goto create_child;
+ }
+
+create_msk:
+ new_msk = mptcp_sk_clone(listener->conn, req);
+ if (!new_msk)
+ subflow_req->mp_capable = 0;
+ } else if (subflow_req->mp_join) {
+ fallback_is_fatal = true;
+ opt_rx.mptcp.mp_join = 0;
+ mptcp_get_options(skb, &opt_rx);
+ if (!opt_rx.mptcp.mp_join ||
+ !subflow_hmac_valid(req, &opt_rx)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ return NULL;
}
}
@@ -217,22 +409,45 @@ create_child:
if (child && *own_req) {
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
- /* we have null ctx on TCP fallback, not fatal on MPC
- * handshake
+ /* we have null ctx on TCP fallback, which is fatal on
+ * MPJ handshake
*/
- if (!ctx)
- return child;
+ if (!ctx) {
+ if (fallback_is_fatal)
+ goto close_child;
+ goto out;
+ }
if (ctx->mp_capable) {
- if (mptcp_token_new_accept(ctx->token))
+ /* new mpc subflow takes ownership of the newly
+ * created mptcp socket
+ */
+ inet_sk_state_store(new_msk, TCP_ESTABLISHED);
+ mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
+ ctx->conn = new_msk;
+ new_msk = NULL;
+ } else if (ctx->mp_join) {
+ struct mptcp_sock *owner;
+
+ owner = mptcp_token_get_sock(ctx->token);
+ if (!owner)
goto close_child;
+
+ ctx->conn = (struct sock *)owner;
+ if (!mptcp_finish_join(child))
+ goto close_child;
+
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
}
}
+out:
+ /* dispose of the left over mptcp master, if any */
+ if (unlikely(new_msk))
+ sock_put(new_msk);
return child;
close_child:
- pr_debug("closing child socket");
tcp_send_active_reset(child, GFP_ATOMIC);
inet_csk_prepare_forced_close(child);
tcp_done(child);
@@ -338,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
data_len = mpext->data_len;
if (data_len == 0) {
pr_err("Infinite mapping not handled");
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
return MAPPING_INVALID;
}
@@ -381,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
/* If this skb data are fully covered by the current mapping,
* the new map would need caching, which is not supported
*/
- if (skb_is_fully_mapped(ssk, skb))
+ if (skb_is_fully_mapped(ssk, skb)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
return MAPPING_INVALID;
+ }
/* will validate the next map after consuming the current one */
return MAPPING_OK;
@@ -408,6 +626,18 @@ validate_seq:
return MAPPING_OK;
}
+static int subflow_read_actor(read_descriptor_t *desc,
+ struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ size_t copy_len = min(desc->count, len);
+
+ desc->count -= copy_len;
+
+ pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
+ return copy_len;
+}
+
static bool subflow_check_data_avail(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -420,9 +650,6 @@ static bool subflow_check_data_avail(struct sock *ssk)
if (subflow->data_avail)
return true;
- if (!subflow->conn)
- return false;
-
msk = mptcp_sk(subflow->conn);
for (;;) {
u32 map_remaining;
@@ -482,16 +709,12 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("discarding %zu bytes, current map len=%d", delta,
map_remaining);
if (delta) {
- struct mptcp_read_arg arg = {
- .msg = NULL,
- };
read_descriptor_t desc = {
.count = delta,
- .arg.data = &arg,
};
int ret;
- ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
+ ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
if (ret < 0) {
ssk->sk_err = -ret;
goto fatal;
@@ -546,19 +769,15 @@ static void subflow_data_ready(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- if (!parent || !subflow->mp_capable) {
+ if (!subflow->mp_capable && !subflow->mp_join) {
subflow->tcp_data_ready(sk);
- if (parent)
- parent->sk_data_ready(parent);
+ parent->sk_data_ready(parent);
return;
}
- if (mptcp_subflow_data_available(sk)) {
- set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
-
- parent->sk_data_ready(parent);
- }
+ if (mptcp_subflow_data_available(sk))
+ mptcp_data_ready(parent, sk);
}
static void subflow_write_space(struct sock *sk)
@@ -567,7 +786,7 @@ static void subflow_write_space(struct sock *sk)
struct sock *parent = subflow->conn;
sk_stream_write_space(sk);
- if (parent && sk_stream_is_writeable(sk)) {
+ if (sk_stream_is_writeable(sk)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
@@ -605,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
}
#endif
+static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr)
+{
+ memset(addr, 0, sizeof(*addr));
+ addr->ss_family = info->family;
+ if (addr->ss_family == AF_INET) {
+ struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
+
+ in_addr->sin_addr = info->addr;
+ in_addr->sin_port = info->port;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
+
+ in6_addr->sin6_addr = info->addr6;
+ in6_addr->sin6_port = info->port;
+ }
+#endif
+}
+
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ struct sockaddr_storage addr;
+ struct socket *sf;
+ u32 remote_token;
+ int addrlen;
+ int err;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ err = mptcp_subflow_create_socket(sk, &sf);
+ if (err)
+ return err;
+
+ subflow = mptcp_subflow_ctx(sf->sk);
+ subflow->remote_key = msk->remote_key;
+ subflow->local_key = msk->local_key;
+ subflow->token = msk->token;
+ mptcp_info2sockaddr(loc, &addr);
+
+ addrlen = sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (loc->family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ sf->sk->sk_bound_dev_if = ifindex;
+ err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+ if (err)
+ goto failed;
+
+ mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
+ pr_debug("msk=%p remote_token=%u", msk, remote_token);
+ subflow->remote_token = remote_token;
+ subflow->local_id = loc->id;
+ subflow->request_join = 1;
+ subflow->request_bkup = 1;
+ mptcp_info2sockaddr(remote, &addr);
+
+ err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+ if (err && err != -EINPROGRESS)
+ goto failed;
+
+ spin_lock_bh(&msk->join_list_lock);
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+
+ return err;
+
+failed:
+ sock_release(sf);
+ return err;
+}
+
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -682,7 +980,7 @@ static bool subflow_is_done(const struct sock *sk)
static void subflow_state_change(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sock *parent = READ_ONCE(subflow->conn);
+ struct sock *parent = subflow->conn;
__subflow_state_change(sk);
@@ -690,13 +988,10 @@ static void subflow_state_change(struct sock *sk)
* a fin packet carrying a DSS can be unnoticed if we don't trigger
* the data available machinery here.
*/
- if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
- set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
-
- parent->sk_data_ready(parent);
- }
+ if (subflow->mp_capable && mptcp_subflow_data_available(sk))
+ mptcp_data_ready(parent, sk);
- if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
parent->sk_shutdown |= RCV_SHUTDOWN;
@@ -772,7 +1067,8 @@ static void subflow_ulp_clone(const struct request_sock *req,
struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
struct mptcp_subflow_context *new_ctx;
- if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) {
+ if (!tcp_rsk(req)->is_mptcp ||
+ (!subflow_req->mp_capable && !subflow_req->mp_join)) {
subflow_ulp_fallback(newsk, old_ctx);
return;
}
@@ -783,22 +1079,35 @@ static void subflow_ulp_clone(const struct request_sock *req,
return;
}
- /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully
- * established only after we receive the remote key
- */
new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
new_ctx->tcp_write_space = old_ctx->tcp_write_space;
- new_ctx->mp_capable = 1;
- new_ctx->fourth_ack = subflow_req->remote_key_valid;
- new_ctx->can_ack = subflow_req->remote_key_valid;
- new_ctx->remote_key = subflow_req->remote_key;
- new_ctx->local_key = subflow_req->local_key;
- new_ctx->token = subflow_req->token;
- new_ctx->ssn_offset = subflow_req->ssn_offset;
- new_ctx->idsn = subflow_req->idsn;
+ new_ctx->rel_write_seq = 1;
+ new_ctx->tcp_sock = newsk;
+
+ if (subflow_req->mp_capable) {
+ /* see comments in subflow_syn_recv_sock(), MPTCP connection
+ * is fully established only after we receive the remote key
+ */
+ new_ctx->mp_capable = 1;
+ new_ctx->fully_established = subflow_req->remote_key_valid;
+ new_ctx->can_ack = subflow_req->remote_key_valid;
+ new_ctx->remote_key = subflow_req->remote_key;
+ new_ctx->local_key = subflow_req->local_key;
+ new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
+ } else if (subflow_req->mp_join) {
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->mp_join = 1;
+ new_ctx->fully_established = 1;
+ new_ctx->backup = subflow_req->backup;
+ new_ctx->local_id = subflow_req->local_id;
+ new_ctx->token = subflow_req->token;
+ new_ctx->thmac = subflow_req->thmac;
+ }
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
@@ -860,6 +1169,8 @@ void mptcp_subflow_init(void)
subflow_v6m_specific.net_frag_header_len = 0;
#endif
+ mptcp_diag_subflow_init(&subflow_ulp_ops);
+
if (tcp_register_ulp(&subflow_ulp_ops) != 0)
panic("MPTCP: failed to register subflows to ULP\n");
}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 84d887806090..129a5ad1bc35 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -128,43 +128,43 @@ int mptcp_token_new_connect(struct sock *sk)
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
- *
- * We don't have an mptcp socket yet at that point.
- * This is paired with mptcp_token_update_accept, called on accept().
*/
-int mptcp_token_new_accept(u32 token)
+int mptcp_token_new_accept(u32 token, struct sock *conn)
{
int err;
spin_lock_bh(&token_tree_lock);
- err = radix_tree_insert(&token_tree, token, &token_used);
+ err = radix_tree_insert(&token_tree, token, conn);
spin_unlock_bh(&token_tree_lock);
return err;
}
/**
- * mptcp_token_update_accept - update token to map to mptcp socket
- * @conn: the new struct mptcp_sock
- * @sk: the initial subflow for this mptcp socket
+ * mptcp_token_get_sock - retrieve mptcp connection sock using its token
+ * @token: token of the mptcp connection to retrieve
+ *
+ * This function returns the mptcp connection structure with the given token.
+ * A reference count on the mptcp socket returned is taken.
*
- * Called when the first mptcp socket is created on accept to
- * refresh the dummy mapping (done to reserve the token) with
- * the mptcp_socket structure that wasn't allocated before.
+ * returns NULL if no connection with the given token value exists.
*/
-void mptcp_token_update_accept(struct sock *sk, struct sock *conn)
+struct mptcp_sock *mptcp_token_get_sock(u32 token)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- void __rcu **slot;
+ struct sock *conn;
spin_lock_bh(&token_tree_lock);
- slot = radix_tree_lookup_slot(&token_tree, subflow->token);
- WARN_ON_ONCE(!slot);
- if (slot) {
- WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used);
- radix_tree_replace_slot(&token_tree, slot, conn);
+ conn = radix_tree_lookup(&token_tree, token);
+ if (conn) {
+ /* token still reserved? */
+ if (conn == (struct sock *)&token_used)
+ conn = NULL;
+ else
+ sock_hold(conn);
}
spin_unlock_bh(&token_tree_lock);
+
+ return mptcp_sk(conn);
}
/**