summaryrefslogtreecommitdiff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/crypto.c2
-rw-r--r--net/mptcp/ctrl.c218
-rw-r--r--net/mptcp/diag.c5
-rw-r--r--net/mptcp/fastopen.c4
-rw-r--r--net/mptcp/mib.c12
-rw-r--r--net/mptcp/mib.h14
-rw-r--r--net/mptcp/mptcp_diag.c2
-rw-r--r--net/mptcp/mptcp_pm_gen.c10
-rw-r--r--net/mptcp/mptcp_pm_gen.h2
-rw-r--r--net/mptcp/options.c112
-rw-r--r--net/mptcp/pm.c86
-rw-r--r--net/mptcp/pm_netlink.c568
-rw-r--r--net/mptcp/pm_userspace.c461
-rw-r--r--net/mptcp/protocol.c349
-rw-r--r--net/mptcp/protocol.h247
-rw-r--r--net/mptcp/sched.c24
-rw-r--r--net/mptcp/sockopt.c167
-rw-r--r--net/mptcp/subflow.c291
-rw-r--r--net/mptcp/token_test.c7
19 files changed, 1729 insertions, 852 deletions
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index a8931349933c..b08ba959ac4f 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -22,7 +22,7 @@
#include <linux/kernel.h>
#include <crypto/sha2.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "protocol.h"
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 13fe0748dde8..2dd81e6c26bd 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -12,6 +12,7 @@
#include <net/netns/generic.h>
#include "protocol.h"
+#include "mib.h"
#define MPTCP_SYSCTL_PATH "net/mptcp"
@@ -27,8 +28,12 @@ struct mptcp_pernet {
#endif
unsigned int add_addr_timeout;
+ unsigned int blackhole_timeout;
unsigned int close_timeout;
unsigned int stale_loss_cnt;
+ atomic_t active_disable_times;
+ u8 syn_retrans_before_tcp_fallback;
+ unsigned long active_disable_stamp;
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
@@ -87,15 +92,88 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
+ pernet->blackhole_timeout = 3600;
+ pernet->syn_retrans_before_tcp_fallback = 2;
+ atomic_set(&pernet->active_disable_times, 0);
pernet->close_timeout = TCP_TIMEWAIT_LEN;
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
- strcpy(pernet->scheduler, "default");
+ strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler));
}
#ifdef CONFIG_SYSCTL
+static int mptcp_set_scheduler(char *scheduler, const char *name)
+{
+ struct mptcp_sched_ops *sched;
+ int ret = 0;
+
+ rcu_read_lock();
+ sched = mptcp_sched_find(name);
+ if (sched)
+ strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int proc_scheduler(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data;
+ char val[MPTCP_SCHED_NAME_MAX];
+ struct ctl_table tbl = {
+ .data = val,
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ };
+ int ret;
+
+ strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = mptcp_set_scheduler(*scheduler, val);
+
+ return ret;
+}
+
+static int proc_available_schedulers(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+
+ return ret;
+}
+
+static int proc_blackhole_detect_timeout(const struct ctl_table *table,
+ int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct mptcp_pernet *pernet = container_of(table->data,
+ struct mptcp_pernet,
+ blackhole_timeout);
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ atomic_set(&pernet->active_disable_times, 0);
+
+ return ret;
+}
+
static struct ctl_table mptcp_sysctl_table[] = {
{
.procname = "enabled",
@@ -148,7 +226,13 @@ static struct ctl_table mptcp_sysctl_table[] = {
.procname = "scheduler",
.maxlen = MPTCP_SCHED_NAME_MAX,
.mode = 0644,
- .proc_handler = proc_dostring,
+ .proc_handler = proc_scheduler,
+ },
+ {
+ .procname = "available_schedulers",
+ .maxlen = MPTCP_SCHED_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_available_schedulers,
},
{
.procname = "close_timeout",
@@ -156,7 +240,19 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {}
+ {
+ .procname = "blackhole_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_blackhole_detect_timeout,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "syn_retrans_before_tcp_fallback",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
};
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -178,7 +274,10 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
table[6].data = &pernet->scheduler;
- table[7].data = &pernet->close_timeout;
+ /* table[7] is for available_schedulers which is read-only info */
+ table[8].data = &pernet->close_timeout;
+ table[9].data = &pernet->blackhole_timeout;
+ table[10].data = &pernet->syn_retrans_before_tcp_fallback;
hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
ARRAY_SIZE(mptcp_sysctl_table));
@@ -198,7 +297,7 @@ err_alloc:
static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
{
- struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
+ const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
unregister_net_sysctl_table(pernet->ctl_table_hdr);
@@ -216,6 +315,115 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}
#endif /* CONFIG_SYSCTL */
+/* The following code block is to deal with middle box issues with MPTCP,
+ * similar to what is done with TFO.
+ * The proposed solution is to disable active MPTCP globally when SYN+MPC are
+ * dropped, while SYN without MPC aren't. In this case, active side MPTCP is
+ * disabled globally for 1hr at first. Then if it happens again, it is disabled
+ * for 2h, then 4h, 8h, ...
+ * The timeout is reset back to 1hr when a successful active MPTCP connection is
+ * fully established.
+ */
+
+/* Disable active MPTCP and record current jiffies and active_disable_times */
+void mptcp_active_disable(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct mptcp_pernet *pernet;
+
+ pernet = mptcp_get_pernet(net);
+
+ if (!READ_ONCE(pernet->blackhole_timeout))
+ return;
+
+ /* Paired with READ_ONCE() in mptcp_active_should_disable() */
+ WRITE_ONCE(pernet->active_disable_stamp, jiffies);
+
+ /* Paired with smp_rmb() in mptcp_active_should_disable().
+ * We want pernet->active_disable_stamp to be updated first.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&pernet->active_disable_times);
+
+ MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE);
+}
+
+/* Calculate timeout for MPTCP active disable
+ * Return true if we are still in the active MPTCP disable period
+ * Return false if timeout already expired and we should use active MPTCP
+ */
+bool mptcp_active_should_disable(struct sock *ssk)
+{
+ struct net *net = sock_net(ssk);
+ unsigned int blackhole_timeout;
+ struct mptcp_pernet *pernet;
+ unsigned long timeout;
+ int disable_times;
+ int multiplier;
+
+ pernet = mptcp_get_pernet(net);
+ blackhole_timeout = READ_ONCE(pernet->blackhole_timeout);
+
+ if (!blackhole_timeout)
+ return false;
+
+ disable_times = atomic_read(&pernet->active_disable_times);
+ if (!disable_times)
+ return false;
+
+ /* Paired with smp_mb__before_atomic() in mptcp_active_disable() */
+ smp_rmb();
+
+ /* Limit timeout to max: 2^6 * initial timeout */
+ multiplier = 1 << min(disable_times - 1, 6);
+
+ /* Paired with the WRITE_ONCE() in mptcp_active_disable(). */
+ timeout = READ_ONCE(pernet->active_disable_stamp) +
+ multiplier * blackhole_timeout * HZ;
+
+ return time_before(jiffies, timeout);
+}
+
+/* Enable active MPTCP and reset active_disable_times if needed */
+void mptcp_active_enable(struct sock *sk)
+{
+ struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk));
+
+ if (atomic_read(&pernet->active_disable_times)) {
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))
+ atomic_set(&pernet->active_disable_times, 0);
+ }
+}
+
+/* Check the number of retransmissions, and fallback to TCP if needed */
+void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!sk_is_mptcp(ssk))
+ return;
+
+ subflow = mptcp_subflow_ctx(ssk);
+
+ if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) {
+ struct net *net = sock_net(ssk);
+ u8 timeouts, to_max;
+
+ timeouts = inet_csk(ssk)->icsk_retransmits;
+ to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback;
+
+ if (timeouts == to_max || (timeouts < to_max && expired)) {
+ MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP);
+ subflow->mpc_drop = 1;
+ mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow);
+ }
+ } else if (ssk->sk_state == TCP_SYN_SENT) {
+ subflow->mpc_drop = 0;
+ }
+}
+
static int __net_init mptcp_net_init(struct net *net)
{
struct mptcp_pernet *pernet = mptcp_get_pernet(net);
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
index 7017dd60659d..02205f7994d7 100644
--- a/net/mptcp/diag.c
+++ b/net/mptcp/diag.c
@@ -10,7 +10,6 @@
#include <linux/net.h>
#include <linux/inet_diag.h>
#include <net/netlink.h>
-#include <uapi/linux/mptcp.h>
#include "protocol.h"
static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
@@ -48,7 +47,7 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM;
if (sf->request_bkup)
flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC;
- if (sf->fully_established)
+ if (READ_ONCE(sf->fully_established))
flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED;
if (sf->conn_finished)
flags |= MPTCP_SUBFLOW_FLAG_CONNECTED;
@@ -95,7 +94,7 @@ static size_t subflow_get_info_size(const struct sock *sk)
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
- nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */
nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
index ad28da655f8b..a29ff901df75 100644
--- a/net/mptcp/fastopen.c
+++ b/net/mptcp/fastopen.c
@@ -68,12 +68,12 @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo
skb = skb_peek_tail(&sk->sk_receive_queue);
if (skb) {
WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
- pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk,
+ pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk,
MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
}
- pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq);
+ pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq);
}
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index c30405e76833..19eb9292bd60 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -15,15 +15,26 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP),
+ SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED),
+ SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT),
SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+ SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX),
SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX),
SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX),
+ SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR),
+ SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR),
+ SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK),
+ SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET),
SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
@@ -67,6 +78,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB),
+ SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index dd7fd1f246b5..128282982843 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <net/inet_common.h>
+
enum linux_mptcp_mib_field {
MPTCP_MIB_NUM = 0,
MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
@@ -8,15 +10,26 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_MPCAPABLEACTIVEDROP, /* Client-side fallback due to a MPC drop */
+ MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */
+ MPTCP_MIB_MPCAPABLEENDPATTEMPT, /* Prohibited MPC to port-based endp */
MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */
MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNBACKUPRX, /* Received a SYN + MP_JOIN + backup flag */
MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKBACKUPRX, /* Received a SYN/ACK + MP_JOIN + backup flag */
MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNTXCONNECTERR, /* Not able to connect() when sending a SYN + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_DSSCORRUPTIONFALLBACK,/* DSS corruption detected, fallback */
+ MPTCP_MIB_DSSCORRUPTIONRESET, /* DSS corruption detected, MPJ subflow reset */
MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */
@@ -66,6 +79,7 @@ enum linux_mptcp_mib_field {
*/
MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */
MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */
+ MPTCP_MIB_BLACKHOLE, /* A blackhole has been detected */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 5409c2ea3f57..0566dd793810 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -10,7 +10,6 @@
#include <linux/net.h>
#include <linux/inet_diag.h>
#include <net/netlink.h>
-#include <uapi/linux/mptcp.h>
#include "protocol.h"
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
@@ -225,6 +224,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static const struct inet_diag_handler mptcp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = mptcp_diag_dump,
.dump_one = mptcp_diag_dump_one,
.idiag_get_info = mptcp_diag_get_info,
diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c
index 670da7822e6c..dcffd847af33 100644
--- a/net/mptcp/mptcp_pm_gen.c
+++ b/net/mptcp/mptcp_pm_gen.c
@@ -14,7 +14,7 @@
const struct nla_policy mptcp_pm_address_nl_policy[MPTCP_PM_ADDR_ATTR_IF_IDX + 1] = {
[MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
[MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
- [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
+ [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_BE32, },
[MPTCP_PM_ADDR_ATTR_ADDR6] = NLA_POLICY_EXACT_LEN(16),
[MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16, },
[MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32, },
@@ -32,8 +32,9 @@ const struct nla_policy mptcp_pm_del_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1]
};
/* MPTCP_PM_CMD_GET_ADDR - do */
-const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = {
- [MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ATTR_TOKEN + 1] = {
+ [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
};
/* MPTCP_PM_CMD_FLUSH_ADDRS - do */
@@ -110,8 +111,7 @@ const struct genl_ops mptcp_pm_nl_ops[11] = {
.doit = mptcp_pm_nl_get_addr_doit,
.dumpit = mptcp_pm_nl_get_addr_dumpit,
.policy = mptcp_pm_get_addr_nl_policy,
- .maxattr = MPTCP_PM_ENDPOINT_ADDR,
- .flags = GENL_UNS_ADMIN_PERM,
+ .maxattr = MPTCP_PM_ATTR_TOKEN,
},
{
.cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h
index ac9fc7225b6a..e24258f6f819 100644
--- a/net/mptcp/mptcp_pm_gen.h
+++ b/net/mptcp/mptcp_pm_gen.h
@@ -18,7 +18,7 @@ extern const struct nla_policy mptcp_pm_add_addr_nl_policy[MPTCP_PM_ENDPOINT_ADD
extern const struct nla_policy mptcp_pm_del_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1];
-extern const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1];
+extern const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ATTR_TOKEN + 1];
extern const struct nla_policy mptcp_pm_flush_addrs_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1];
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 63fc0758c22d..23949ae2a3a8 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -108,7 +108,6 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->suboptions |= OPTION_MPTCP_DSS;
mp_opt->use_map = 1;
mp_opt->mpc_map = 1;
- mp_opt->use_ack = 0;
mp_opt->data_len = get_unaligned_be16(ptr);
ptr += 2;
}
@@ -117,7 +116,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
ptr += 2;
}
- pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u",
+ pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n",
version, flags, opsize, mp_opt->sndr_key,
mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum);
break;
@@ -131,7 +130,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 4;
mp_opt->nonce = get_unaligned_be32(ptr);
ptr += 4;
- pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+ pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n",
mp_opt->backup, mp_opt->join_id,
mp_opt->token, mp_opt->nonce);
} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
@@ -142,26 +141,21 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 8;
mp_opt->nonce = get_unaligned_be32(ptr);
ptr += 4;
- pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
mp_opt->backup, mp_opt->join_id,
mp_opt->thmac, mp_opt->nonce);
} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK;
ptr += 2;
memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
- pr_debug("MP_JOIN hmac");
+ pr_debug("MP_JOIN hmac\n");
}
break;
case MPTCPOPT_DSS:
- pr_debug("DSS");
+ pr_debug("DSS\n");
ptr++;
- /* we must clear 'mpc_map' be able to detect MP_CAPABLE
- * map vs DSS map in mptcp_incoming_options(), and reconstruct
- * map info accordingly
- */
- mp_opt->mpc_map = 0;
flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
@@ -169,7 +163,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
- pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
+ pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n",
mp_opt->data_fin, mp_opt->dsn64,
mp_opt->use_map, mp_opt->ack64,
mp_opt->use_ack);
@@ -207,7 +201,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 4;
}
- pr_debug("data_ack=%llu", mp_opt->data_ack);
+ pr_debug("data_ack=%llu\n", mp_opt->data_ack);
}
if (mp_opt->use_map) {
@@ -231,7 +225,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 2;
}
- pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
mp_opt->data_seq, mp_opt->subflow_seq,
mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD),
mp_opt->csum);
@@ -293,7 +287,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->ahmac = get_unaligned_be64(ptr);
ptr += 8;
}
- pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
+ pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n",
(mp_opt->addr.family == AF_INET6) ? "6" : "",
mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port));
break;
@@ -309,7 +303,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
for (i = 0; i < mp_opt->rm_list.nr; i++)
mp_opt->rm_list.ids[i] = *ptr++;
- pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
+ pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr);
break;
case MPTCPOPT_MP_PRIO:
@@ -318,7 +312,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->suboptions |= OPTION_MPTCP_PRIO;
mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
- pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
+ pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup);
break;
case MPTCPOPT_MP_FASTCLOSE:
@@ -329,7 +323,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->rcvr_key = get_unaligned_be64(ptr);
ptr += 8;
mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE;
- pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key);
+ pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key);
break;
case MPTCPOPT_RST:
@@ -343,7 +337,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
flags = *ptr++;
mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
mp_opt->reset_reason = *ptr;
- pr_debug("MP_RST: transient=%u reason=%u",
+ pr_debug("MP_RST: transient=%u reason=%u\n",
mp_opt->reset_transient, mp_opt->reset_reason);
break;
@@ -354,7 +348,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 2;
mp_opt->suboptions |= OPTION_MPTCP_FAIL;
mp_opt->fail_seq = get_unaligned_be64(ptr);
- pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq);
+ pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq);
break;
default:
@@ -369,8 +363,11 @@ void mptcp_get_options(const struct sk_buff *skb,
const unsigned char *ptr;
int length;
- /* initialize option status */
- mp_opt->suboptions = 0;
+ /* Ensure that casting the whole status to u32 is efficient and safe */
+ BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status),
+ sizeof(u32)));
+ *(u32 *)&mp_opt->status = 0;
length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
@@ -417,7 +414,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
} else if (subflow->request_join) {
- pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+ pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token,
subflow->local_nonce);
opts->suboptions = OPTION_MPTCP_MPJ_SYN;
opts->join_id = subflow->local_id;
@@ -461,7 +458,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
return false;
/* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */
- if (subflow->fully_established || snd_data_fin_enable ||
+ if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable ||
subflow->snd_isn != TCP_SKB_CB(skb)->seq ||
sk->sk_state != TCP_ESTABLISHED)
return false;
@@ -500,7 +497,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
*size = TCPOLEN_MPTCP_MPC_ACK;
}
- pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+ pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n",
subflow, subflow->local_key, subflow->remote_key,
data_len);
@@ -509,7 +506,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
opts->suboptions = OPTION_MPTCP_MPJ_ACK;
memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
*size = TCPOLEN_MPTCP_MPJ_ACK;
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
/* we can use the full delegate action helper only from BH context
* If we are in process context - sk is flushing the backlog at
@@ -607,7 +604,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
}
opts->ext_copy.use_ack = 1;
opts->suboptions = OPTION_MPTCP_DSS;
- WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
@@ -655,6 +651,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
bool drop_other_suboptions = false;
unsigned int opt_size = *size;
+ struct mptcp_addr_info addr;
bool echo;
int len;
@@ -663,19 +660,26 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
*/
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
- !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
+ !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr,
&echo, &drop_other_suboptions))
return false;
+ /*
+ * Later on, mptcp_write_options() will enforce mutually exclusion with
+ * DSS, bail out if such option is set and we can't drop it.
+ */
if (drop_other_suboptions)
remaining += opt_size;
- len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
+ else if (opts->suboptions & OPTION_MPTCP_DSS)
+ return false;
+
+ len = mptcp_add_addr_len(addr.family, echo, !!addr.port);
if (remaining < len)
return false;
*size = len;
if (drop_other_suboptions) {
- pr_debug("drop other suboptions");
+ pr_debug("drop other suboptions\n");
opts->suboptions = 0;
/* note that e.g. DSS could have written into the memory
@@ -686,16 +690,17 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
opts->ahmac = 0;
*size -= opt_size;
}
+ opts->addr = addr;
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
if (!echo) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
- opts->ahmac = add_addr_generate_hmac(msk->local_key,
- msk->remote_key,
+ opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key),
+ READ_ONCE(msk->remote_key),
&opts->addr);
} else {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
}
- pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
+ pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n",
opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));
return true;
@@ -726,7 +731,7 @@ static bool mptcp_established_options_rm_addr(struct sock *sk,
opts->rm_list = rm_list;
for (i = 0; i < opts->rm_list.nr; i++)
- pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
+ pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]);
MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr);
return true;
}
@@ -752,7 +757,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
opts->suboptions |= OPTION_MPTCP_PRIO;
opts->backup = subflow->request_bkup;
- pr_debug("prio=%d", opts->backup);
+ pr_debug("prio=%d\n", opts->backup);
return true;
}
@@ -792,9 +797,9 @@ static bool mptcp_established_options_fastclose(struct sock *sk,
*size = TCPOLEN_MPTCP_FASTCLOSE;
opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
- opts->rcvr_key = msk->remote_key;
+ opts->rcvr_key = READ_ONCE(msk->remote_key);
- pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+ pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
return true;
}
@@ -816,7 +821,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk,
opts->suboptions |= OPTION_MPTCP_FAIL;
opts->fail_seq = subflow->map_seq;
- pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+ pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
return true;
@@ -904,16 +909,16 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
opts->csum_reqd = subflow_req->csum_reqd;
opts->allow_join_id0 = subflow_req->allow_join_id0;
*size = TCPOLEN_MPTCP_MPC_SYNACK;
- pr_debug("subflow_req=%p, local_key=%llu",
+ pr_debug("subflow_req=%p, local_key=%llu\n",
subflow_req, subflow_req->local_key);
return true;
} else if (subflow_req->mp_join) {
opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
- opts->backup = subflow_req->backup;
+ opts->backup = subflow_req->request_bkup;
opts->join_id = subflow_req->local_id;
opts->thmac = subflow_req->thmac;
opts->nonce = subflow_req->local_nonce;
- pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
subflow_req, opts->backup, opts->join_id,
opts->thmac, opts->nonce);
*size = TCPOLEN_MPTCP_MPJ_SYNACK;
@@ -930,7 +935,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
/* here we can process OoO, in-window pkts, only in-sequence 4th ack
* will make the subflow fully established
*/
- if (likely(subflow->fully_established)) {
+ if (likely(READ_ONCE(subflow->fully_established))) {
/* on passive sockets, check for 3rd ack retransmission
* note that msk is always set by subflow_syn_recv_sock()
* for mp_join subflows
@@ -958,7 +963,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (subflow->remote_key_valid &&
(((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
- ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) {
+ ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) &&
+ (!mp_opt->echo || subflow->mp_join)))) {
/* subflows are fully established as soon as we get any
* additional ack, including ADD_ADDR.
*/
@@ -1031,7 +1037,7 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
{
msk->bytes_acked += new_snd_una - msk->snd_una;
- msk->snd_una = new_snd_una;
+ WRITE_ONCE(msk->snd_una, new_snd_una);
}
static void ack_update_msk(struct mptcp_sock *msk,
@@ -1058,21 +1064,22 @@ static void ack_update_msk(struct mptcp_sock *msk,
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
if (after64(new_wnd_end, msk->wnd_end))
- msk->wnd_end = new_wnd_end;
+ WRITE_ONCE(msk->wnd_end, new_wnd_end);
/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
- if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
+ if (after64(msk->wnd_end, snd_nxt))
__mptcp_check_push(sk, ssk);
if (after64(new_snd_una, old_snd_una)) {
__mptcp_snd_una_update(msk, new_snd_una);
__mptcp_data_acked(sk);
}
+ msk->last_ack_recv = tcp_jiffies32;
mptcp_data_unlock(sk);
trace_ack_update_msk(mp_opt->data_ack,
old_snd_una, new_snd_una,
- new_wnd_end, msk->wnd_end);
+ new_wnd_end, READ_ONCE(msk->wnd_end));
}
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
@@ -1100,8 +1107,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
if (mp_opt->echo)
return true;
- hmac = add_addr_generate_hmac(msk->remote_key,
- msk->local_key,
+ hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key),
+ READ_ONCE(msk->local_key),
&mp_opt->addr);
pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
@@ -1148,7 +1155,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) {
if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) &&
- msk->local_key == mp_opt.rcvr_key) {
+ READ_ONCE(msk->local_key) == mp_opt.rcvr_key) {
WRITE_ONCE(msk->rcv_fastclose, true);
mptcp_schedule_work((struct sock *)msk);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX);
@@ -1279,7 +1286,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
}
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
}
- return;
+ goto update_wspace;
}
if (rcv_wnd_new != rcv_wnd_old) {
@@ -1304,6 +1311,9 @@ raise_win:
th->window = htons(new_win);
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
}
+
+update_wspace:
+ WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
}
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 4ae19113b8eb..16c336c51940 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -6,7 +6,6 @@
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/kernel.h>
-#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
@@ -20,7 +19,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
{
u8 add_addr = READ_ONCE(msk->pm.addr_signal);
- pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);
+ pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo);
lockdep_assert_held(&msk->pm.lock);
@@ -46,7 +45,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_
{
u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
- pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
+ pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr);
if (rm_addr) {
MPTCP_ADD_STATS(sock_net((struct sock *)msk),
@@ -61,23 +60,13 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_
return 0;
}
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
-{
- pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
-
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_nl_rm_subflow_received(msk, rm_list);
- spin_unlock_bh(&msk->pm.lock);
- return 0;
-}
-
/* path manager event handlers */
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
{
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
+ pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side);
WRITE_ONCE(pm->server_side, server_side);
mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
@@ -101,7 +90,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
subflows_max = mptcp_pm_get_subflows_max(msk);
- pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
+ pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows,
subflows_max, READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
@@ -125,7 +114,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
enum mptcp_pm_status new_status)
{
- pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+ pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status,
BIT(new_status));
if (msk->pm.status & BIT(new_status))
return false;
@@ -140,7 +129,7 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
struct mptcp_pm_data *pm = &msk->pm;
bool announce = false;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
spin_lock_bh(&pm->lock);
@@ -164,14 +153,17 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
+
+ if (msk->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
}
void mptcp_pm_subflow_established(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
if (!READ_ONCE(pm->work_pending))
return;
@@ -223,7 +215,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+ pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id,
READ_ONCE(pm->accept_addr));
mptcp_event_addr_announced(ssk, addr);
@@ -237,7 +229,9 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
} else {
__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
}
- } else if (!READ_ONCE(pm->accept_addr)) {
+ /* id0 should not have a different address */
+ } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) ||
+ (addr->id > 0 && !READ_ONCE(pm->accept_addr))) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
@@ -254,7 +248,7 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
{
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
spin_lock_bh(&pm->lock);
@@ -278,7 +272,7 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
struct mptcp_pm_data *pm = &msk->pm;
u8 i;
- pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
+ pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr);
for (i = 0; i < rm_list->nr; i++)
mptcp_event_addr_removed(msk, rm_list->ids[i]);
@@ -310,19 +304,19 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- pr_debug("fail_seq=%llu", fail_seq);
+ pr_debug("fail_seq=%llu\n", fail_seq);
if (!READ_ONCE(msk->allow_infinite_fallback))
return;
if (!subflow->fail_tout) {
- pr_debug("send MP_FAIL response and infinite map");
+ pr_debug("send MP_FAIL response and infinite map\n");
subflow->send_mp_fail = 1;
subflow->send_infinite_map = 1;
tcp_send_ack(sk);
} else {
- pr_debug("MP_FAIL response received");
+ pr_debug("MP_FAIL response received\n");
WRITE_ONCE(subflow->fail_tout, 0);
}
}
@@ -427,27 +421,39 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return mptcp_pm_nl_get_local_id(msk, &skc_local);
}
-int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
- u8 *flags, int *ifindex)
+bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
{
- *flags = 0;
- *ifindex = 0;
+ struct mptcp_addr_info skc_local;
- if (!id)
- return 0;
+ mptcp_local_address((struct sock_common *)skc, &skc_local);
if (mptcp_pm_is_userspace(msk))
- return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
- return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
+ return mptcp_userspace_pm_is_backup(msk, &skc_local);
+
+ return mptcp_pm_nl_is_backup(msk, &skc_local);
+}
+
+int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ if (info->attrs[MPTCP_PM_ATTR_TOKEN])
+ return mptcp_userspace_pm_get_addr(skb, info);
+ return mptcp_pm_nl_get_addr(skb, info);
+}
+
+int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ const struct genl_info *info = genl_info_dump(cb);
+
+ if (info->attrs[MPTCP_PM_ATTR_TOKEN])
+ return mptcp_userspace_pm_dump_addr(msg, cb);
+ return mptcp_pm_nl_dump_addr(msg, cb);
}
-int mptcp_pm_set_flags(struct net *net, struct nlattr *token,
- struct mptcp_pm_addr_entry *loc,
- struct mptcp_pm_addr_entry *rem, u8 bkup)
+int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
{
- if (token)
- return mptcp_userspace_pm_set_flags(net, token, loc, rem, bkup);
- return mptcp_pm_nl_set_flags(net, loc, bkup);
+ if (info->attrs[MPTCP_PM_ATTR_TOKEN])
+ return mptcp_userspace_pm_set_flags(skb, info);
+ return mptcp_pm_nl_set_flags(skb, info);
}
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 58d17d9604e7..7868207c4e9d 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -8,18 +8,13 @@
#include <linux/inet.h>
#include <linux/kernel.h>
-#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/netns/generic.h>
#include <net/mptcp.h>
-#include <net/genetlink.h>
-#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
-
-/* forward declaration */
-static struct genl_family mptcp_genl_family;
+#include "mptcp_pm_gen.h"
static int pm_nl_pernet_id;
@@ -112,8 +107,8 @@ static void remote_address(const struct sock_common *skc,
#endif
}
-static bool lookup_subflow_by_saddr(const struct list_head *list,
- const struct mptcp_addr_info *saddr)
+bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
@@ -135,12 +130,15 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
- struct sock_common *skc;
list_for_each_entry(subflow, list, node) {
- skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!((1 << inet_sk_state_load(ssk)) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV)))
+ continue;
- remote_address(skc, &cur);
+ remote_address((struct sock_common *)ssk, &cur);
if (mptcp_addresses_equal(&cur, daddr, daddr->port))
return true;
}
@@ -148,11 +146,13 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
return false;
}
-static struct mptcp_pm_addr_entry *
+static bool
select_local_address(const struct pm_nl_pernet *pernet,
- const struct mptcp_sock *msk)
+ const struct mptcp_sock *msk,
+ struct mptcp_pm_local *new_local)
{
- struct mptcp_pm_addr_entry *entry, *ret = NULL;
+ struct mptcp_pm_addr_entry *entry;
+ bool found = false;
msk_owned_by_me(msk);
@@ -164,17 +164,23 @@ select_local_address(const struct pm_nl_pernet *pernet,
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
- ret = entry;
+ new_local->addr = entry->addr;
+ new_local->flags = entry->flags;
+ new_local->ifindex = entry->ifindex;
+ found = true;
break;
}
rcu_read_unlock();
- return ret;
+
+ return found;
}
-static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
+static bool
+select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
+ struct mptcp_pm_local *new_local)
{
- struct mptcp_pm_addr_entry *entry, *ret = NULL;
+ struct mptcp_pm_addr_entry *entry;
+ bool found = false;
rcu_read_lock();
/* do not keep any additional per socket state, just signal
@@ -189,11 +195,15 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
- ret = entry;
+ new_local->addr = entry->addr;
+ new_local->flags = entry->flags;
+ new_local->ifindex = entry->ifindex;
+ found = true;
break;
}
rcu_read_unlock();
- return ret;
+
+ return found;
}
unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
@@ -284,7 +294,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
if (!msk)
return;
@@ -303,7 +313,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
spin_lock_bh(&msk->pm.lock);
if (!mptcp_pm_should_add_signal_addr(msk)) {
- pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id);
+ pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
mptcp_pm_announce_addr(msk, &entry->addr, false);
mptcp_pm_add_addr_send_ack(msk);
entry->retrans_times++;
@@ -328,15 +338,21 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *entry;
struct sock *sk = (struct sock *)msk;
+ struct timer_list *add_timer = NULL;
spin_lock_bh(&msk->pm.lock);
entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
- if (entry && (!check_id || entry->addr.id == addr->id))
+ if (entry && (!check_id || entry->addr.id == addr->id)) {
entry->retrans_times = ADD_ADDR_RETRANS_MAX;
+ add_timer = &entry->add_timer;
+ }
+ if (!check_id && entry)
+ list_del(&entry->list);
spin_unlock_bh(&msk->pm.lock);
- if (entry && (!check_id || entry->addr.id == addr->id))
- sk_stop_timer_sync(sk, &entry->add_timer);
+ /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */
+ if (add_timer)
+ sk_stop_timer_sync(sk, add_timer);
return entry;
}
@@ -353,7 +369,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
if (add_entry) {
- if (mptcp_pm_is_kernel(msk))
+ if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk)))
return false;
sk_reset_timer(sk, &add_entry->add_timer,
@@ -384,7 +400,7 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
struct sock *sk = (struct sock *)msk;
LIST_HEAD(free_list);
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
spin_lock_bh(&msk->pm.lock);
list_splice_init(&msk->pm.anno_list, &free_list);
@@ -470,13 +486,12 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow;
- pr_debug("send ack for %s",
+ pr_debug("send ack for %s\n",
prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
slow = lock_sock_fast(ssk);
if (prio) {
subflow->send_mp_prio = 1;
- subflow->backup = backup;
subflow->request_bkup = backup;
}
@@ -497,7 +512,8 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
+ lockdep_is_held(&pernet->lock)) {
if (entry->addr.id == id)
return entry;
}
@@ -505,15 +521,13 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
}
static struct mptcp_pm_addr_entry *
-__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info,
- bool lookup_by_id)
+__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
{
struct mptcp_pm_addr_entry *entry;
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if ((!lookup_by_id &&
- mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) ||
- (lookup_by_id && entry->addr.id == info->id))
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
+ lockdep_is_held(&pernet->lock)) {
+ if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
return entry;
}
return NULL;
@@ -522,10 +536,11 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info,
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
- struct mptcp_pm_addr_entry *local;
unsigned int add_addr_signal_max;
+ bool signal_and_subflow = false;
unsigned int local_addr_max;
struct pm_nl_pernet *pernet;
+ struct mptcp_pm_local local;
unsigned int subflows_max;
pernet = pm_nl_get_pernet(sock_net(sk));
@@ -543,7 +558,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
rcu_read_lock();
- entry = __lookup_addr(pernet, &mpc_addr, false);
+ entry = __lookup_addr(pernet, &mpc_addr);
if (entry) {
__clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
msk->mpc_endpoint_id = entry->addr.id;
@@ -564,8 +579,6 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
/* check first for announce */
if (msk->pm.add_addr_signaled < add_addr_signal_max) {
- local = select_signal_address(pernet, msk);
-
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
@@ -576,16 +589,30 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
return;
- if (local) {
- if (mptcp_pm_alloc_anno_list(msk, &local->addr)) {
- __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
- msk->pm.add_addr_signaled++;
- mptcp_pm_announce_addr(msk, &local->addr, false);
- mptcp_pm_nl_addr_send_ack(msk);
- }
- }
+ if (!select_signal_address(pernet, msk, &local))
+ goto subflow;
+
+ /* If the alloc fails, we are on memory pressure, not worth
+ * continuing, and trying to create subflows.
+ */
+ if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
+ return;
+
+ __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
+ msk->pm.add_addr_signaled++;
+
+ /* Special case for ID0: set the correct ID */
+ if (local.addr.id == msk->mpc_endpoint_id)
+ local.addr.id = 0;
+
+ mptcp_pm_announce_addr(msk, &local.addr, false);
+ mptcp_pm_nl_addr_send_ack(msk);
+
+ if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ signal_and_subflow = true;
}
+subflow:
/* check if should create a new subflow */
while (msk->pm.local_addr_used < local_addr_max &&
msk->pm.subflows < subflows_max) {
@@ -593,21 +620,28 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
bool fullmesh;
int i, nr;
- local = select_local_address(pernet, msk);
- if (!local)
+ if (signal_and_subflow)
+ signal_and_subflow = false;
+ else if (!select_local_address(pernet, msk, &local))
break;
- fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
+ fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
+
+ __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
+
+ /* Special case for ID0: set the correct ID */
+ if (local.addr.id == msk->mpc_endpoint_id)
+ local.addr.id = 0;
+ else /* local_addr_used is not decr for ID 0 */
+ msk->pm.local_addr_used++;
- msk->pm.local_addr_used++;
- __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
- nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs);
+ nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs);
if (nr == 0)
continue;
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
- __mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
+ __mptcp_subflow_connect(sk, &local, &addrs[i]);
spin_lock_bh(&msk->pm.lock);
}
mptcp_pm_nl_check_work_pending(msk);
@@ -628,10 +662,11 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
*/
static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
- struct mptcp_addr_info *addrs)
+ struct mptcp_pm_local *locals)
{
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
+ struct mptcp_addr_info mpc_addr;
struct pm_nl_pernet *pernet;
unsigned int subflows_max;
int i = 0;
@@ -639,6 +674,8 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
pernet = pm_nl_get_pernet_from_msk(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
+ mptcp_local_address((struct sock_common *)msk, &mpc_addr);
+
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
@@ -648,8 +685,16 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
continue;
if (msk->pm.subflows < subflows_max) {
+ locals[i].addr = entry->addr;
+ locals[i].flags = entry->flags;
+ locals[i].ifindex = entry->ifindex;
+
+ /* Special case for ID0: set the correct ID */
+ if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port))
+ locals[i].addr.id = 0;
+
msk->pm.subflows++;
- addrs[i++] = entry->addr;
+ i++;
}
}
rcu_read_unlock();
@@ -658,21 +703,19 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
* 'IPADDRANY' local address
*/
if (!i) {
- struct mptcp_addr_info local;
-
- memset(&local, 0, sizeof(local));
- local.family =
+ memset(&locals[i], 0, sizeof(locals[i]));
+ locals[i].addr.family =
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
remote->family == AF_INET6 &&
ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
#endif
remote->family;
- if (!mptcp_pm_addr_families_match(sk, &local, remote))
+ if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote))
return 0;
msk->pm.subflows++;
- addrs[i++] = local;
+ i++;
}
return i;
@@ -680,17 +723,18 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
- struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+ struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
struct sock *sk = (struct sock *)msk;
unsigned int add_addr_accept_max;
struct mptcp_addr_info remote;
unsigned int subflows_max;
+ bool sf_created = false;
int i, nr;
add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
- pr_debug("accepted %d:%d remote family %d",
+ pr_debug("accepted %d:%d remote family %d\n",
msk->pm.add_addr_accepted, add_addr_accept_max,
msk->pm.remote.family);
@@ -708,24 +752,38 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
/* connect to the specified remote address, using whatever
* local address the routing configuration will pick.
*/
- nr = fill_local_addresses_vec(msk, &remote, addrs);
+ nr = fill_local_addresses_vec(msk, &remote, locals);
if (nr == 0)
return;
- msk->pm.add_addr_accepted++;
- if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
- msk->pm.subflows >= subflows_max)
- WRITE_ONCE(msk->pm.accept_addr, false);
-
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
- __mptcp_subflow_connect(sk, &addrs[i], &remote);
+ if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0)
+ sf_created = true;
spin_lock_bh(&msk->pm.lock);
+
+ if (sf_created) {
+ /* add_addr_accepted is not decr for ID 0 */
+ if (remote.id)
+ msk->pm.add_addr_accepted++;
+ if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
+ msk->pm.subflows >= subflows_max)
+ WRITE_ONCE(msk->pm.accept_addr, false);
+ }
+}
+
+bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_addr_info mpc_remote;
+
+ remote_address((struct sock_common *)msk, &mpc_remote);
+ return mptcp_addresses_equal(&mpc_remote, remote, remote->port);
}
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
{
- struct mptcp_subflow_context *subflow;
+ struct mptcp_subflow_context *subflow, *alt = NULL;
msk_owned_by_me(msk);
lockdep_assert_held(&msk->pm.lock);
@@ -734,9 +792,20 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
!mptcp_pm_should_rm_signal(msk))
return;
- subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
- if (subflow)
- mptcp_pm_send_ack(msk, subflow, false, false);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (__mptcp_subflow_active(subflow)) {
+ if (!subflow->stale) {
+ mptcp_pm_send_ack(msk, subflow, false, false);
+ return;
+ }
+
+ if (!alt)
+ alt = subflow;
+ }
+ }
+
+ if (alt)
+ mptcp_pm_send_ack(msk, alt, false, false);
}
int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
@@ -746,7 +815,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
{
struct mptcp_subflow_context *subflow;
- pr_debug("bkup=%d", bkup);
+ pr_debug("bkup=%d\n", bkup);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -769,11 +838,6 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
return -EINVAL;
}
-static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id)
-{
- return local_id == id || (!local_id && msk->mpc_endpoint_id == id);
-}
-
static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list,
enum linux_mptcp_mib_field rm_type)
@@ -782,7 +846,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
struct sock *sk = (struct sock *)msk;
u8 i;
- pr_debug("%s rm_list_nr %d",
+ pr_debug("%s rm_list_nr %d\n",
rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);
msk_owned_by_me(msk);
@@ -806,37 +870,45 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
u8 id = subflow_get_local_id(subflow);
+ if ((1 << inet_sk_state_load(ssk)) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
+ continue;
if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id)
continue;
- if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id))
+ if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id)
continue;
- pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u",
+ pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n",
rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
i, rm_id, id, remote_id, msk->mpc_endpoint_id);
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
+ removed |= subflow->request_join;
/* the following takes care of updating the subflows counter */
mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
- removed = true;
- __MPTCP_INC_STATS(sock_net(sk), rm_type);
+ if (rm_type == MPTCP_MIB_RMSUBFLOW)
+ __MPTCP_INC_STATS(sock_net(sk), rm_type);
}
- if (rm_type == MPTCP_MIB_RMSUBFLOW)
- __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap);
+
+ if (rm_type == MPTCP_MIB_RMADDR)
+ __MPTCP_INC_STATS(sock_net(sk), rm_type);
+
if (!removed)
continue;
if (!mptcp_pm_is_kernel(msk))
continue;
- if (rm_type == MPTCP_MIB_RMADDR) {
- msk->pm.add_addr_accepted--;
- WRITE_ONCE(msk->pm.accept_addr, true);
- } else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
- msk->pm.local_addr_used--;
+ if (rm_type == MPTCP_MIB_RMADDR && rm_id &&
+ !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
+ /* Note: if the subflow has been closed before, this
+ * add_addr_accepted counter will not be decremented.
+ */
+ if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk))
+ WRITE_ONCE(msk->pm.accept_addr, true);
}
}
}
@@ -846,8 +918,8 @@ static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
}
-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
- const struct mptcp_rm_list *rm_list)
+static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list)
{
mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
}
@@ -863,7 +935,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk)
spin_lock_bh(&msk->pm.lock);
- pr_debug("msk=%p status=%x", msk, pm->status);
+ pr_debug("msk=%p status=%x\n", msk, pm->status);
if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
mptcp_pm_nl_add_addr_received(msk);
@@ -905,7 +977,7 @@ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry,
- bool needs_id)
+ bool needs_id, bool replace)
{
struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
unsigned int addr_max;
@@ -945,6 +1017,17 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
if (entry->addr.id)
goto out;
+ /* allow callers that only need to look up the local
+ * addr's id to skip replacement. This allows them to
+ * avoid calling synchronize_rcu in the packet recv
+ * path.
+ */
+ if (!replace) {
+ kfree(entry);
+ ret = cur->addr.id;
+ goto out;
+ }
+
pernet->addrs--;
entry->addr.id = cur->addr.id;
list_del_rcu(&cur->list);
@@ -1059,6 +1142,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
*/
inet_sk_state_store(newsk, TCP_LISTEN);
lock_sock(ssk);
+ WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true);
err = __inet_listen_sk(ssk, backlog);
if (!err)
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
@@ -1070,17 +1154,13 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc
{
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
- int ret = -1;
+ int ret;
pernet = pm_nl_get_pernet_from_msk(msk);
rcu_read_lock();
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) {
- ret = entry->addr.id;
- break;
- }
- }
+ entry = __lookup_addr(pernet, skc);
+ ret = entry ? entry->addr.id : -1;
rcu_read_unlock();
if (ret >= 0)
return ret;
@@ -1096,13 +1176,27 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc
entry->ifindex = 0;
entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
entry->lsk = NULL;
- ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true);
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false);
if (ret < 0)
kfree(entry);
return ret;
}
+bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ struct mptcp_pm_addr_entry *entry;
+ bool backup;
+
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, skc);
+ backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ rcu_read_unlock();
+
+ return backup;
+}
+
#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1
@@ -1263,20 +1357,27 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
return pm_nl_get_pernet(genl_info_net(info));
}
-static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
+static int mptcp_nl_add_subflow_or_signal_addr(struct net *net,
+ struct mptcp_addr_info *addr)
{
struct mptcp_sock *msk;
long s_slot = 0, s_num = 0;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info mpc_addr;
if (!READ_ONCE(msk->fully_established) ||
mptcp_pm_is_userspace(msk))
goto next;
+ /* if the endp linked to the init sf is re-added with a != ID */
+ mptcp_local_address((struct sock_common *)msk, &mpc_addr);
+
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
+ if (mptcp_addresses_equal(addr, &mpc_addr, addr->port))
+ msk->mpc_endpoint_id = addr->id;
mptcp_pm_create_subflow_or_signal_addr(msk);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
@@ -1312,8 +1413,8 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
- if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
- GENL_SET_ERR_MSG(info, "flags must have signal when using port");
+ if (addr.addr.port && !address_use_port(&addr)) {
+ GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port");
return -EINVAL;
}
@@ -1343,13 +1444,14 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
}
}
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry,
- !mptcp_pm_has_addr_attr_id(attr, info));
+ !mptcp_pm_has_addr_attr_id(attr, info),
+ true);
if (ret < 0) {
GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
goto out_free;
}
- mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
+ mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr);
return 0;
out_free:
@@ -1357,32 +1459,13 @@ out_free:
return ret;
}
-int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
- u8 *flags, int *ifindex)
-{
- struct mptcp_pm_addr_entry *entry;
- struct sock *sk = (struct sock *)msk;
- struct net *net = sock_net(sk);
-
- rcu_read_lock();
- entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id);
- if (entry) {
- *flags = entry->flags;
- *ifindex = entry->ifindex;
- }
- rcu_read_unlock();
-
- return 0;
-}
-
-static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
+bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
entry = mptcp_pm_del_add_timer(msk, addr, false);
if (entry) {
- list_del(&entry->list);
kfree(entry);
return true;
}
@@ -1390,6 +1473,12 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
return false;
}
+static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
+}
+
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool force)
@@ -1397,28 +1486,38 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
struct mptcp_rm_list list = { .nr = 0 };
bool ret;
- list.ids[list.nr++] = addr->id;
+ list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
- ret = remove_anno_list_by_saddr(msk, addr);
+ ret = mptcp_remove_anno_list_by_saddr(msk, addr);
if (ret || force) {
spin_lock_bh(&msk->pm.lock);
+ if (ret) {
+ __set_bit(addr->id, msk->pm.id_avail_bitmap);
+ msk->pm.add_addr_signaled--;
+ }
mptcp_pm_remove_addr(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
return ret;
}
+static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id)
+{
+ /* If it was marked as used, and not ID 0, decrement local_addr_used */
+ if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) &&
+ id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0))
+ msk->pm.local_addr_used--;
+}
+
static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
const struct mptcp_pm_addr_entry *entry)
{
const struct mptcp_addr_info *addr = &entry->addr;
- struct mptcp_rm_list list = { .nr = 0 };
+ struct mptcp_rm_list list = { .nr = 1 };
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
- pr_debug("remove_id=%d", addr->id);
-
- list.ids[list.nr++] = addr->id;
+ pr_debug("remove_id=%d\n", addr->id);
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
@@ -1427,17 +1526,26 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
if (mptcp_pm_is_userspace(msk))
goto next;
- if (list_empty(&msk->conn_list)) {
- mptcp_pm_remove_anno_addr(msk, addr, false);
- goto next;
- }
-
lock_sock(sk);
- remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
+ remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr);
mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
!(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
- if (remove_subflow)
- mptcp_pm_remove_subflow(msk, &list);
+
+ list.ids[0] = mptcp_endp_get_local_id(msk, addr);
+ if (remove_subflow) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_nl_rm_subflow_received(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ spin_lock_bh(&msk->pm.lock);
+ __mark_subflow_endp_available(msk, list.ids[0]);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+
+ if (msk->mpc_endpoint_id == entry->addr.id)
+ msk->mpc_endpoint_id = 0;
release_sock(sk);
next:
@@ -1472,6 +1580,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net,
spin_lock_bh(&msk->pm.lock);
mptcp_pm_remove_addr(msk, &list);
mptcp_pm_nl_rm_subflow_received(msk, &list);
+ __mark_subflow_endp_available(msk, 0);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
@@ -1531,52 +1640,37 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
return ret;
}
-void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
-{
- struct mptcp_rm_list alist = { .nr = 0 };
- struct mptcp_pm_addr_entry *entry;
-
- list_for_each_entry(entry, rm_list, list) {
- if ((remove_anno_list_by_saddr(msk, &entry->addr) ||
- lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) &&
- alist.nr < MPTCP_RM_IDS_MAX)
- alist.ids[alist.nr++] = entry->addr.id;
- }
-
- if (alist.nr) {
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_remove_addr(msk, &alist);
- spin_unlock_bh(&msk->pm.lock);
- }
-}
-
-void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
- struct list_head *rm_list)
+static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list)
{
struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, rm_list, list) {
- if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
- slist.nr < MPTCP_RM_IDS_MAX)
- slist.ids[slist.nr++] = entry->addr.id;
+ if (slist.nr < MPTCP_RM_IDS_MAX &&
+ mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
+ slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
- if (remove_anno_list_by_saddr(msk, &entry->addr) &&
- alist.nr < MPTCP_RM_IDS_MAX)
- alist.ids[alist.nr++] = entry->addr.id;
+ if (alist.nr < MPTCP_RM_IDS_MAX &&
+ mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
+ alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
}
+ spin_lock_bh(&msk->pm.lock);
if (alist.nr) {
- spin_lock_bh(&msk->pm.lock);
+ msk->pm.add_addr_signaled -= alist.nr;
mptcp_pm_remove_addr(msk, &alist);
- spin_unlock_bh(&msk->pm.lock);
}
if (slist.nr)
- mptcp_pm_remove_subflow(msk, &slist);
+ mptcp_pm_nl_rm_subflow_received(msk, &slist);
+ /* Reset counters: maybe some subflows have been removed before */
+ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ msk->pm.local_addr_used = 0;
+ spin_unlock_bh(&msk->pm.lock);
}
-static void mptcp_nl_remove_addrs_list(struct net *net,
- struct list_head *rm_list)
+static void mptcp_nl_flush_addrs_list(struct net *net,
+ struct list_head *rm_list)
{
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
@@ -1589,7 +1683,7 @@ static void mptcp_nl_remove_addrs_list(struct net *net,
if (!mptcp_pm_is_userspace(msk)) {
lock_sock(sk);
- mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
+ mptcp_pm_flush_addrs_and_subflows(msk, rm_list);
release_sock(sk);
}
@@ -1630,14 +1724,14 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
- mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
+ mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list);
synchronize_rcu();
__flush_addrs(&free_list);
return 0;
}
-static int mptcp_nl_fill_addr(struct sk_buff *skb,
- struct mptcp_pm_addr_entry *entry)
+int mptcp_nl_fill_addr(struct sk_buff *skb,
+ struct mptcp_pm_addr_entry *entry)
{
struct mptcp_addr_info *addr = &entry->addr;
struct nlattr *attr;
@@ -1675,7 +1769,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info)
+int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
@@ -1700,7 +1794,7 @@ int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
- spin_lock_bh(&pernet->lock);
+ rcu_read_lock();
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
GENL_SET_ERR_MSG(info, "address not found");
@@ -1714,19 +1808,24 @@ int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info)
genlmsg_end(msg, reply);
ret = genlmsg_reply(msg, info);
- spin_unlock_bh(&pernet->lock);
+ rcu_read_unlock();
return ret;
unlock_fail:
- spin_unlock_bh(&pernet->lock);
+ rcu_read_unlock();
fail:
nlmsg_free(msg);
return ret;
}
-int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
+int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ return mptcp_pm_get_addr(skb, info);
+}
+
+int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb)
{
struct net *net = sock_net(msg->sk);
struct mptcp_pm_addr_entry *entry;
@@ -1737,7 +1836,7 @@ int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg,
pernet = pm_nl_get_pernet(net);
- spin_lock_bh(&pernet->lock);
+ rcu_read_lock();
for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
if (test_bit(i, pernet->id_bitmap)) {
entry = __lookup_addr_by_id(pernet, i);
@@ -1762,12 +1861,18 @@ int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg,
genlmsg_end(msg, hdr);
}
}
- spin_unlock_bh(&pernet->lock);
+ rcu_read_unlock();
cb->args[0] = id;
return msg->len;
}
+int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ return mptcp_pm_dump_addr(msg, cb);
+}
+
static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
{
struct nlattr *attr = info->attrs[id];
@@ -1845,10 +1950,11 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
{
struct mptcp_rm_list list = { .nr = 0 };
- list.ids[list.nr++] = addr->id;
+ list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
spin_lock_bh(&msk->pm.lock);
mptcp_pm_nl_rm_subflow_received(msk, &list);
+ __mark_subflow_endp_available(msk, list.ids[0]);
mptcp_pm_create_subflow_or_signal_addr(msk);
spin_unlock_bh(&msk->pm.lock);
}
@@ -1882,66 +1988,64 @@ next:
return ret;
}
-int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup)
+int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
{
- struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
+ struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, };
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
MPTCP_PM_ADDR_FLAG_FULLMESH;
+ struct net *net = sock_net(skb->sk);
struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
u8 lookup_by_id = 0;
+ u8 bkup = 0;
+ int ret;
+
+ pernet = pm_nl_get_pernet(net);
+
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
- if (addr->addr.family == AF_UNSPEC) {
+ if (addr.addr.family == AF_UNSPEC) {
lookup_by_id = 1;
- if (!addr->addr.id)
+ if (!addr.addr.id) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
return -EOPNOTSUPP;
+ }
}
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
+ bkup = 1;
+
spin_lock_bh(&pernet->lock);
- entry = __lookup_addr(pernet, &addr->addr, lookup_by_id);
+ entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) :
+ __lookup_addr(pernet, &addr.addr);
if (!entry) {
spin_unlock_bh(&pernet->lock);
+ GENL_SET_ERR_MSG(info, "address not found");
return -EINVAL;
}
- if ((addr->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
- (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
+ (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL |
+ MPTCP_PM_ADDR_FLAG_IMPLICIT))) {
spin_unlock_bh(&pernet->lock);
+ GENL_SET_ERR_MSG(info, "invalid addr flags");
return -EINVAL;
}
- changed = (addr->flags ^ entry->flags) & mask;
- entry->flags = (entry->flags & ~mask) | (addr->flags & mask);
- *addr = *entry;
+ changed = (addr.flags ^ entry->flags) & mask;
+ entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
+ addr = *entry;
spin_unlock_bh(&pernet->lock);
- mptcp_nl_set_flags(net, &addr->addr, bkup, changed);
+ mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
return 0;
}
int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, };
- struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, };
- struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct net *net = sock_net(skb->sk);
- u8 bkup = 0;
- int ret;
-
- ret = mptcp_pm_parse_entry(attr, info, false, &addr);
- if (ret < 0)
- return ret;
-
- if (attr_rem) {
- ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote);
- if (ret < 0)
- return ret;
- }
-
- if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
- bkup = 1;
-
- return mptcp_pm_set_flags(net, token, &addr, &remote, bkup);
+ return mptcp_pm_set_flags(skb, info);
}
static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp)
@@ -2014,7 +2118,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb,
const struct mptcp_subflow_context *sf;
u8 sk_err;
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
return -EMSGSIZE;
if (mptcp_event_add_subflow(skb, ssk))
@@ -2072,7 +2176,7 @@ static int mptcp_event_created(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
- int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token);
+ int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token));
if (err)
return err;
@@ -2100,7 +2204,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id)
if (!nlh)
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id))
@@ -2135,7 +2239,7 @@ void mptcp_event_addr_announced(const struct sock *ssk,
if (!nlh)
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id))
@@ -2251,7 +2355,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
goto nla_put_failure;
break;
case MPTCP_EVENT_CLOSED:
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0)
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0)
goto nla_put_failure;
break;
case MPTCP_EVENT_ANNOUNCED:
@@ -2281,7 +2385,7 @@ nla_put_failure:
nlmsg_free(skb);
}
-static struct genl_family mptcp_genl_family __ro_after_init = {
+struct genl_family mptcp_genl_family __ro_after_init = {
.name = MPTCP_PM_NAME,
.version = MPTCP_PM_VER,
.netnsok = true,
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index bc97cc30f013..a3d477059b11 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -6,6 +6,11 @@
#include "protocol.h"
#include "mib.h"
+#include "mptcp_pm_gen.h"
+
+#define mptcp_for_each_userspace_pm_addr(__msk, __entry) \
+ list_for_each_entry(__entry, \
+ &((__msk)->pm.userspace_pm_local_addr_list), list)
void mptcp_free_local_addr_list(struct mptcp_sock *msk)
{
@@ -25,6 +30,19 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk)
}
}
+static struct mptcp_pm_addr_entry *
+mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (mptcp_addresses_equal(&entry->addr, addr, false))
+ return entry;
+ }
+ return NULL;
+}
+
static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
struct mptcp_pm_addr_entry *entry,
bool needs_id)
@@ -40,7 +58,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
+ mptcp_for_each_userspace_pm_addr(msk, e) {
addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true);
if (addr_match && entry->addr.id == 0 && needs_id)
entry->addr.id = e->addr.id;
@@ -89,59 +107,43 @@ append_err:
static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk,
struct mptcp_pm_addr_entry *addr)
{
- struct mptcp_pm_addr_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
- list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) {
- if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) {
- /* TODO: a refcount is needed because the entry can
- * be used multiple times (e.g. fullmesh mode).
- */
- list_del_rcu(&entry->list);
- kfree(entry);
- msk->pm.local_addr_used--;
- return 0;
- }
- }
+ entry = mptcp_userspace_pm_lookup_addr(msk, &addr->addr);
+ if (!entry)
+ return -EINVAL;
- return -EINVAL;
+ /* TODO: a refcount is needed because the entry can
+ * be used multiple times (e.g. fullmesh mode).
+ */
+ list_del_rcu(&entry->list);
+ sock_kfree_s(sk, entry, sizeof(*entry));
+ msk->pm.local_addr_used--;
+ return 0;
}
-int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
- unsigned int id,
- u8 *flags, int *ifindex)
+static struct mptcp_pm_addr_entry *
+mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id)
{
- struct mptcp_pm_addr_entry *entry, *match = NULL;
+ struct mptcp_pm_addr_entry *entry;
- spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
- if (id == entry->addr.id) {
- match = entry;
- break;
- }
- }
- spin_unlock_bh(&msk->pm.lock);
- if (match) {
- *flags = match->flags;
- *ifindex = match->ifindex;
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (entry->addr.id == id)
+ return entry;
}
-
- return 0;
+ return NULL;
}
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
struct mptcp_addr_info *skc)
{
- struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry;
+ struct mptcp_pm_addr_entry *entry = NULL, new_entry;
__be16 msk_sport = ((struct inet_sock *)
inet_sk((struct sock *)msk))->inet_sport;
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
- if (mptcp_addresses_equal(&e->addr, skc, false)) {
- entry = e;
- break;
- }
- }
+ entry = mptcp_userspace_pm_lookup_addr(msk, skc);
spin_unlock_bh(&msk->pm.lock);
if (entry)
return entry->addr.id;
@@ -157,36 +159,64 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true);
}
-int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info)
+bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk,
+ struct mptcp_addr_info *skc)
+{
+ struct mptcp_pm_addr_entry *entry;
+ bool backup;
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr(msk, skc);
+ backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ spin_unlock_bh(&msk->pm.lock);
+
+ return backup;
+}
+
+static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info)
{
struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct mptcp_sock *msk;
+
+ if (!token) {
+ GENL_SET_ERR_MSG(info, "missing required token");
+ return NULL;
+ }
+
+ msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token));
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return NULL;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ sock_put((struct sock *)msk);
+ return NULL;
+ }
+
+ return msk;
+}
+
+int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info)
+{
struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct mptcp_pm_addr_entry addr_val;
struct mptcp_sock *msk;
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
- if (!addr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!addr) {
+ GENL_SET_ERR_MSG(info, "missing required address");
return err;
}
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto announce_err;
- }
-
err = mptcp_pm_parse_entry(addr, info, true, &addr_val);
if (err < 0) {
GENL_SET_ERR_MSG(info, "error parsing local address");
@@ -257,40 +287,48 @@ remove_err:
return err;
}
+void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_rm_list alist = { .nr = 0 };
+ int anno_nr = 0;
+
+ /* only delete if either announced or matching a subflow */
+ if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
+ anno_nr++;
+ else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
+ return;
+
+ alist.ids[alist.nr++] = entry->addr.id;
+
+ spin_lock_bh(&msk->pm.lock);
+ msk->pm.add_addr_signaled -= anno_nr;
+ mptcp_pm_remove_addr(msk, &alist);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID];
- struct mptcp_pm_addr_entry *match = NULL;
- struct mptcp_pm_addr_entry *entry;
+ struct mptcp_pm_addr_entry *match;
struct mptcp_sock *msk;
- LIST_HEAD(free_list);
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
u8 id_val;
- if (!id || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!id) {
+ GENL_SET_ERR_MSG(info, "missing required ID");
return err;
}
id_val = nla_get_u8(id);
- token_val = nla_get_u32(token);
- msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto out;
- }
-
if (id_val == 0) {
err = mptcp_userspace_pm_remove_id_zero_address(msk, info);
goto out;
@@ -298,28 +336,23 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info)
lock_sock(sk);
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
- if (entry->addr.id == id_val) {
- match = entry;
- break;
- }
- }
-
+ spin_lock_bh(&msk->pm.lock);
+ match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val);
if (!match) {
GENL_SET_ERR_MSG(info, "address with specified id not found");
+ spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
goto out;
}
- list_move(&match->list, &free_list);
+ list_del_rcu(&match->list);
+ spin_unlock_bh(&msk->pm.lock);
- mptcp_pm_remove_addrs(msk, &free_list);
+ mptcp_pm_remove_addr_entry(msk, match);
release_sock(sk);
- list_for_each_entry_safe(match, entry, &free_list, list) {
- sock_kfree_s(sk, match, sizeof(*match));
- }
+ sock_kfree_s(sk, match, sizeof(*match));
err = 0;
out:
@@ -330,41 +363,37 @@ out:
int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct mptcp_pm_addr_entry local = { 0 };
+ struct mptcp_pm_addr_entry entry = { 0 };
struct mptcp_addr_info addr_r;
- struct mptcp_addr_info addr_l;
+ struct mptcp_pm_local local;
struct mptcp_sock *msk;
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
- if (!laddr || !raddr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!laddr || !raddr) {
+ GENL_SET_ERR_MSG(info, "missing required address(es)");
return err;
}
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(genl_info_net(info), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ err = mptcp_pm_parse_entry(laddr, info, true, &entry);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
goto create_err;
}
- err = mptcp_pm_parse_addr(laddr, info, &addr_l);
- if (err < 0) {
- NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ if (entry.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ GENL_SET_ERR_MSG(info, "invalid addr flags");
+ err = -EINVAL;
goto create_err;
}
+ entry.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
err = mptcp_pm_parse_addr(raddr, info, &addr_r);
if (err < 0) {
@@ -372,28 +401,29 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
goto create_err;
}
- if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) {
+ if (!mptcp_pm_addr_families_match(sk, &entry.addr, &addr_r)) {
GENL_SET_ERR_MSG(info, "families mismatch");
err = -EINVAL;
goto create_err;
}
- local.addr = addr_l;
- err = mptcp_userspace_pm_append_new_local_addr(msk, &local, false);
+ err = mptcp_userspace_pm_append_new_local_addr(msk, &entry, false);
if (err < 0) {
GENL_SET_ERR_MSG(info, "did not match address and id");
goto create_err;
}
- lock_sock(sk);
-
- err = __mptcp_subflow_connect(sk, &addr_l, &addr_r);
+ local.addr = entry.addr;
+ local.flags = entry.flags;
+ local.ifindex = entry.ifindex;
+ lock_sock(sk);
+ err = __mptcp_subflow_connect(sk, &local, &addr_r);
release_sock(sk);
spin_lock_bh(&msk->pm.lock);
if (err)
- mptcp_userspace_pm_delete_local_addr(msk, &local);
+ mptcp_userspace_pm_delete_local_addr(msk, &entry);
else
msk->pm.subflows++;
spin_unlock_bh(&msk->pm.lock);
@@ -454,36 +484,25 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct mptcp_addr_info addr_l;
+ struct mptcp_pm_addr_entry addr_l;
struct mptcp_addr_info addr_r;
struct mptcp_sock *msk;
struct sock *sk, *ssk;
int err = -EINVAL;
- u32 token_val;
- if (!laddr || !raddr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!laddr || !raddr) {
+ GENL_SET_ERR_MSG(info, "missing required address(es)");
return err;
}
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(genl_info_net(info), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto destroy_err;
- }
-
- err = mptcp_pm_parse_addr(laddr, info, &addr_l);
+ err = mptcp_pm_parse_entry(laddr, info, true, &addr_l);
if (err < 0) {
NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
goto destroy_err;
@@ -496,43 +515,41 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) {
- ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6);
- addr_l.family = AF_INET6;
+ if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) {
+ ipv6_addr_set_v4mapped(addr_l.addr.addr.s_addr, &addr_l.addr.addr6);
+ addr_l.addr.family = AF_INET6;
}
- if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) {
+ if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr.addr6)) {
ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6);
addr_r.family = AF_INET6;
}
#endif
- if (addr_l.family != addr_r.family) {
+ if (addr_l.addr.family != addr_r.family) {
GENL_SET_ERR_MSG(info, "address families do not match");
err = -EINVAL;
goto destroy_err;
}
- if (!addr_l.port || !addr_r.port) {
+ if (!addr_l.addr.port || !addr_r.port) {
GENL_SET_ERR_MSG(info, "missing local or remote port");
err = -EINVAL;
goto destroy_err;
}
lock_sock(sk);
- ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
- if (ssk) {
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct mptcp_pm_addr_entry entry = { .addr = addr_l };
-
- spin_lock_bh(&msk->pm.lock);
- mptcp_userspace_pm_delete_local_addr(msk, &entry);
- spin_unlock_bh(&msk->pm.lock);
- mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
- mptcp_close_ssk(sk, ssk, subflow);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
- err = 0;
- } else {
+ ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r);
+ if (!ssk) {
err = -ESRCH;
+ goto release_sock;
}
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_userspace_pm_delete_local_addr(msk, &addr_l);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ mptcp_close_ssk(sk, ssk, mptcp_subflow_ctx(ssk));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+release_sock:
release_sock(sk);
destroy_err:
@@ -540,35 +557,173 @@ destroy_err:
return err;
}
-int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
- struct mptcp_pm_addr_entry *loc,
- struct mptcp_pm_addr_entry *rem, u8 bkup)
+int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
{
+ struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, };
+ struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, };
+ struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_pm_addr_entry *entry;
struct mptcp_sock *msk;
int ret = -EINVAL;
struct sock *sk;
- u32 token_val;
+ u8 bkup = 0;
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(net, token_val);
+ msk = mptcp_userspace_pm_get_sock(info);
if (!msk)
return ret;
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk))
+ ret = mptcp_pm_parse_entry(attr, info, false, &loc);
+ if (ret < 0)
goto set_flags_err;
- if (loc->addr.family == AF_UNSPEC ||
- rem->addr.family == AF_UNSPEC)
+ if (attr_rem) {
+ ret = mptcp_pm_parse_entry(attr_rem, info, false, &rem);
+ if (ret < 0)
+ goto set_flags_err;
+ }
+
+ if (loc.addr.family == AF_UNSPEC ||
+ rem.addr.family == AF_UNSPEC) {
+ GENL_SET_ERR_MSG(info, "invalid address families");
+ ret = -EINVAL;
goto set_flags_err;
+ }
+
+ if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
+ bkup = 1;
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr(msk, &loc.addr);
+ if (entry) {
+ if (bkup)
+ entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ }
+ spin_unlock_bh(&msk->pm.lock);
lock_sock(sk);
- ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup);
release_sock(sk);
set_flags_err:
sock_put(sk);
return ret;
}
+
+int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct id_bitmap {
+ DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1);
+ } *bitmap;
+ const struct genl_info *info = genl_info_dump(cb);
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+ struct sock *sk;
+ void *hdr;
+
+ bitmap = (struct id_bitmap *)cb->ctx;
+
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
+ return ret;
+
+ sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (test_bit(entry->addr.id, bitmap->map))
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
+
+ __set_bit(entry->addr.id, bitmap->map);
+ genlmsg_end(msg, hdr);
+ }
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+ ret = msg->len;
+
+ sock_put(sk);
+ return ret;
+}
+
+int mptcp_userspace_pm_get_addr(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct mptcp_sock *msk;
+ struct sk_buff *msg;
+ int ret = -EINVAL;
+ struct sock *sk;
+ void *reply;
+
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
+ return ret;
+
+ sk = (struct sock *)msk;
+
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ if (ret < 0)
+ goto out;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply) {
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr_by_id(msk, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto unlock_fail;
+ }
+
+ ret = mptcp_nl_fill_addr(msg, entry);
+ if (ret)
+ goto unlock_fail;
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+ sock_put(sk);
+ return ret;
+
+unlock_fail:
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+fail:
+ nlmsg_free(msg);
+out:
+ sock_put(sk);
+ return ret;
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 7833a49f6214..6bd819047470 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -15,12 +15,12 @@
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
+#include <net/hotdata.h>
#include <net/xfrm.h>
#include <asm/ioctls.h>
#include "protocol.h"
@@ -47,7 +47,7 @@ static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
-static struct net_device mptcp_napi_dev;
+static struct net_device *mptcp_napi_dev;
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
@@ -136,10 +136,11 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
int delta;
if (MPTCP_SKB_CB(from)->offset ||
+ ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
!skb_try_coalesce(to, from, &fragstolen, &delta))
return false;
- pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
to->len, MPTCP_SKB_CB(from)->end_seq);
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
@@ -217,7 +218,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end_seq = MPTCP_SKB_CB(skb)->end_seq;
max_seq = atomic64_read(&msk->rcv_wnd_sent);
- pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue));
if (after64(end_seq, max_seq)) {
/* out of window */
@@ -350,8 +351,10 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
skb_orphan(skb);
/* try to fetch required memory from subflow */
- if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
+ if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
goto drop;
+ }
has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
@@ -410,6 +413,7 @@ static void mptcp_close_wake_up(struct sock *sk)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
+/* called under the msk socket lock */
static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -441,16 +445,17 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
}
}
+/* can be called with no lock acquired */
static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (READ_ONCE(msk->rcv_data_fin) &&
- ((1 << sk->sk_state) &
+ ((1 << inet_sk_state_load(sk)) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
- if (msk->ack_seq == rcv_data_fin_seq) {
+ if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) {
if (seq)
*seq = rcv_data_fin_seq;
@@ -524,13 +529,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk)
mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow));
}
-static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
+static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied)
{
bool slow;
slow = lock_sock_fast(ssk);
if (tcp_can_send_ack(ssk))
- tcp_cleanup_rbuf(ssk, 1);
+ tcp_cleanup_rbuf(ssk, copied);
unlock_sock_fast(ssk, slow);
}
@@ -547,7 +552,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
(ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED)));
}
-static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
+static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied)
{
int old_space = READ_ONCE(msk->old_wspace);
struct mptcp_subflow_context *subflow;
@@ -555,14 +560,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
int space = __mptcp_space(sk);
bool cleanup, rx_empty;
- cleanup = (space > 0) && (space >= (old_space << 1));
- rx_empty = !__mptcp_rmem(sk);
+ cleanup = (space > 0) && (space >= (old_space << 1)) && copied;
+ rx_empty = !__mptcp_rmem(sk) && copied;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty))
- mptcp_subflow_cleanup_rbuf(ssk);
+ mptcp_subflow_cleanup_rbuf(ssk, copied);
}
}
@@ -616,6 +621,18 @@ static bool mptcp_check_data_fin(struct sock *sk)
return ret;
}
+static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
+{
+ if (READ_ONCE(msk->allow_infinite_fallback)) {
+ MPTCP_INC_STATS(sock_net(ssk),
+ MPTCP_MIB_DSSCORRUPTIONFALLBACK);
+ mptcp_do_fallback(ssk);
+ } else {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET);
+ mptcp_subflow_reset(ssk);
+ }
+}
+
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sock *ssk,
unsigned int *bytes)
@@ -639,7 +656,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
}
- pr_debug("msk=%p ssk=%p", msk, ssk);
+ pr_debug("msk=%p ssk=%p\n", msk, ssk);
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -688,10 +705,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
moved += len;
seq += len;
- if (WARN_ON_ONCE(map_remaining < len))
- break;
+ if (unlikely(map_remaining < len)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ mptcp_dss_corruption(msk, ssk);
+ }
} else {
- WARN_ON_ONCE(!fin);
+ if (unlikely(!fin)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ mptcp_dss_corruption(msk, ssk);
+ }
+
sk_eat_skb(ssk, skb);
done = true;
}
@@ -705,6 +728,8 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
} while (more_data_avail);
+ if (moved > 0)
+ msk->last_data_recv = tcp_jiffies32;
*bytes += moved;
return done;
}
@@ -718,7 +743,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
u64 end_seq;
p = rb_first(&msk->out_of_order_queue);
- pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
while (p) {
skb = rb_to_skb(p);
if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
@@ -740,7 +765,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
/* skip overlapping data, if any */
- pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n",
MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
delta);
MPTCP_SKB_CB(skb)->offset += delta;
@@ -748,7 +773,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
msk->bytes_received += end_seq - msk->ack_seq;
- msk->ack_seq = end_seq;
+ WRITE_ONCE(msk->ack_seq, end_seq);
moved = true;
}
return moved;
@@ -840,10 +865,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
sk_rbuf = ssk_rbuf;
/* over limit? can't append more skbs to msk, Also, no need to wake-up*/
- if (__mptcp_rmem(sk) > sk_rbuf) {
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
+ if (__mptcp_rmem(sk) > sk_rbuf)
return;
- }
/* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk);
@@ -985,6 +1008,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
+/* called under both the msk socket lock and the data lock */
static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1033,13 +1057,15 @@ static void __mptcp_clean_una(struct sock *sk)
msk->recovery = false;
out:
- if (snd_una == READ_ONCE(msk->snd_nxt) &&
- snd_una == READ_ONCE(msk->write_seq)) {
+ if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) {
if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
mptcp_stop_rtx_timer(sk);
} else {
mptcp_reset_rtx_timer(sk);
}
+
+ if (mptcp_pending_data_fin_ack(sk))
+ mptcp_schedule_work(sk);
}
static void __mptcp_clean_una_wakeup(struct sock *sk)
@@ -1233,7 +1259,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
size_t copy;
int i;
- pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n",
msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
if (WARN_ON_ONCE(info->sent > info->limit ||
@@ -1266,7 +1292,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
- if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
+ if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
tcp_mark_push(tcp_sk(ssk), skb);
goto alloc_skb;
}
@@ -1334,7 +1360,7 @@ alloc_skb:
mpext->use_map = 1;
mpext->dsn64 = 1;
- pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n",
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
mpext->dsn64);
@@ -1415,13 +1441,15 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
mptcp_for_each_subflow(msk, subflow) {
+ bool backup = subflow->backup || subflow->request_bkup;
+
trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_subflow_active(subflow))
continue;
tout = max(tout, mptcp_timeout_from_subflow(subflow));
- nr_active += !subflow->backup;
+ nr_active += !backup;
pace = subflow->avg_pacing_rate;
if (unlikely(!pace)) {
/* init pacing rate from socket */
@@ -1432,9 +1460,9 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
- if (linger_time < send_info[subflow->backup].linger_time) {
- send_info[subflow->backup].ssk = ssk;
- send_info[subflow->backup].linger_time = linger_time;
+ if (linger_time < send_info[backup].linger_time) {
+ send_info[backup].ssk = ssk;
+ send_info[backup].linger_time = linger_time;
}
}
__mptcp_set_timeout(sk, tout);
@@ -1500,7 +1528,7 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
*/
if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
- msk->snd_nxt = snd_nxt_new;
+ WRITE_ONCE(msk->snd_nxt, snd_nxt_new);
}
}
@@ -1552,6 +1580,8 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
err = copied;
out:
+ if (err > 0)
+ msk->last_data_sent = tcp_jiffies32;
return err;
}
@@ -1687,15 +1717,6 @@ out:
}
}
-static void mptcp_set_nospace(struct sock *sk)
-{
- /* enable autotune */
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
- /* will be cleared on avail space */
- set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
-}
-
static int mptcp_disconnect(struct sock *sk, int flags);
static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
@@ -1746,8 +1767,10 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
* see mptcp_disconnect().
* Attempt it again outside the problematic scope.
*/
- if (!mptcp_disconnect(sk, 0))
+ if (!mptcp_disconnect(sk, 0)) {
+ sk->sk_disconnects++;
sk->sk_socket->state = SS_UNCONNECTED;
+ }
}
inet_clear_bit(DEFER_CONNECT, sk);
@@ -1766,6 +1789,30 @@ static int do_copy_data_nocache(struct sock *sk, int copy,
return 0;
}
+/* open-code sk_stream_memory_free() plus sent limit computation to
+ * avoid indirect calls in fast-path.
+ * Called under the msk socket lock, so we can avoid a bunch of ONCE
+ * annotations.
+ */
+static u32 mptcp_send_limit(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 limit, not_sent;
+
+ if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
+ return 0;
+
+ limit = mptcp_notsent_lowat(sk);
+ if (limit == UINT_MAX)
+ return UINT_MAX;
+
+ not_sent = msk->write_seq - msk->snd_nxt;
+ if (not_sent >= limit)
+ return 0;
+
+ return limit - not_sent;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1810,6 +1857,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct mptcp_data_frag *dfrag;
bool dfrag_collapsed;
size_t psize, offset;
+ u32 copy_limit;
+
+ /* ensure fitting the notsent_lowat() constraint */
+ copy_limit = mptcp_send_limit(sk);
+ if (!copy_limit)
+ goto wait_for_memory;
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
@@ -1817,9 +1870,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
dfrag = mptcp_pending_tail(sk);
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
if (!dfrag_collapsed) {
- if (!sk_stream_memory_free(sk))
- goto wait_for_memory;
-
if (!mptcp_page_frag_refill(sk, pfrag))
goto wait_for_memory;
@@ -1834,6 +1884,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
offset = dfrag->offset + dfrag->data_len;
psize = pfrag->size - offset;
psize = min_t(size_t, psize, msg_data_left(msg));
+ psize = min_t(size_t, psize, copy_limit);
total_ts = psize + frag_truesize;
if (!sk_wmem_schedule(sk, total_ts))
@@ -1862,14 +1913,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!msk->first_pending)
WRITE_ONCE(msk->first_pending, dfrag);
}
- pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
+ pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);
continue;
wait_for_memory:
- mptcp_set_nospace(sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
@@ -1891,6 +1942,8 @@ do_error:
goto out;
}
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
+
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
struct msghdr *msg,
size_t len, int flags,
@@ -1944,6 +1997,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
break;
}
+ mptcp_rcv_space_adjust(msk, copied);
return copied;
}
@@ -2012,13 +2066,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(grow, msk->rcvq_space.space);
rcvwin += (grow << 1);
- rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin),
+ rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;
- window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf);
+ window_clamp = mptcp_win_from_space(sk, rcvbuf);
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make subflows follow along. If we do not do this, we
@@ -2033,8 +2087,9 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
ssk = mptcp_subflow_tcp_sock(subflow);
slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
- tcp_sk(ssk)->window_clamp = window_clamp;
- tcp_cleanup_rbuf(ssk, 1);
+ WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
+ if (tcp_can_send_ack(ssk))
+ tcp_cleanup_rbuf(ssk, 1);
unlock_sock_fast(ssk, slow);
}
}
@@ -2115,7 +2170,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
skb = skb_peek(&msk->receive_queue);
if (skb) {
- u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+ u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
if (hint_val >= INT_MAX)
return INT_MAX;
@@ -2157,7 +2212,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
cmsg_flags = MPTCP_CMSG_INQ;
while (copied < len) {
- int bytes_read;
+ int err, bytes_read;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
if (unlikely(bytes_read < 0)) {
@@ -2168,13 +2223,10 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- /* be sure to advertise window change */
- mptcp_cleanup_rbuf(msk);
-
if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
continue;
- /* only the master socket status is relevant here. The exit
+ /* only the MPTCP socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
*/
if (copied >= target)
@@ -2218,10 +2270,17 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
}
- pr_debug("block timeout %ld", timeo);
- sk_wait_data(sk, &timeo, NULL);
+ pr_debug("block timeout %ld\n", timeo);
+ mptcp_cleanup_rbuf(msk, copied);
+ err = sk_wait_data(sk, &timeo, NULL);
+ if (err < 0) {
+ err = copied ? : err;
+ goto out_err;
+ }
}
+ mptcp_cleanup_rbuf(msk, copied);
+
out_err:
if (cmsg_flags && copied >= 0) {
if (cmsg_flags & MPTCP_CMSG_TS)
@@ -2234,11 +2293,9 @@ out_err:
}
}
- pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
+ pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n",
msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
skb_queue_empty(&msk->receive_queue), copied);
- if (!(flags & MSG_PEEK))
- mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
return copied;
@@ -2296,7 +2353,7 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
continue;
}
- if (subflow->backup) {
+ if (subflow->backup || subflow->request_bkup) {
if (!backup)
backup = ssk;
continue;
@@ -2478,6 +2535,12 @@ out:
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
+ /* The first subflow can already be closed and still in the list */
+ if (subflow->close_event_done)
+ return;
+
+ subflow->close_event_done = true;
+
if (sk->sk_state == TCP_ESTABLISHED)
mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
@@ -2503,8 +2566,11 @@ static void __mptcp_close_subflow(struct sock *sk)
mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int ssk_state = inet_sk_state_load(ssk);
- if (inet_sk_state_load(ssk) != TCP_CLOSE)
+ if (ssk_state != TCP_CLOSE &&
+ (ssk_state != TCP_CLOSE_WAIT ||
+ inet_sk_state_load(sk) != TCP_ESTABLISHED))
continue;
/* 'subflow_data_ready' will re-sched once rx queue is empty */
@@ -2542,7 +2608,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
- tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+ mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
@@ -2665,8 +2731,8 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
return;
- close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
- mptcp_close_timeout(sk);
+ close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp -
+ tcp_jiffies32 + jiffies + mptcp_close_timeout(sk);
/* the close timeout takes precedence on the fail one, and here at least one of
* them is active
@@ -2684,7 +2750,7 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
if (!ssk)
return;
- pr_debug("MP_FAIL doesn't respond, reset the subflow");
+ pr_debug("MP_FAIL doesn't respond, reset the subflow\n");
slow = lock_sock_fast(ssk);
mptcp_subflow_reset(ssk);
@@ -2759,7 +2825,7 @@ static void __mptcp_init_sock(struct sock *sk)
__skb_queue_head_init(&msk->receive_queue);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
- msk->rmem_fwd_alloc = 0;
+ WRITE_ONCE(msk->rmem_fwd_alloc, 0);
WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
@@ -2770,6 +2836,9 @@ static void __mptcp_init_sock(struct sock *sk)
WRITE_ONCE(msk->allow_infinite_fallback, true);
msk->recovery = false;
msk->subflow_id = 1;
+ msk->last_data_sent = tcp_jiffies32;
+ msk->last_data_recv = tcp_jiffies32;
+ msk->last_ack_recv = tcp_jiffies32;
mptcp_pm_data_init(msk);
@@ -2783,7 +2852,8 @@ static void mptcp_ca_reset(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_assign_congestion_control(sk);
- strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
+ strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name,
+ sizeof(mptcp_sk(sk)->ca_name));
/* no need to keep a reference to the ops, the name will suffice */
tcp_cleanup_congestion_control(sk);
@@ -2803,8 +2873,10 @@ static int mptcp_init_sock(struct sock *sk)
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
+ rcu_read_lock();
ret = mptcp_init_sched(mptcp_sk(sk),
mptcp_sched_find(mptcp_get_scheduler(net)));
+ rcu_read_unlock();
if (ret)
return ret;
@@ -2854,7 +2926,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
break;
default:
if (__mptcp_check_fallback(mptcp_sk(sk))) {
- pr_debug("Fallback");
+ pr_debug("Fallback\n");
ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how);
@@ -2864,7 +2936,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
mptcp_schedule_work(sk);
} else {
- pr_debug("Sending DATA_FIN on subflow %p", ssk);
+ pr_debug("Sending DATA_FIN on subflow %p\n", ssk);
tcp_send_ack(ssk);
if (!mptcp_rtx_timer_pending(sk))
mptcp_reset_rtx_timer(sk);
@@ -2884,9 +2956,14 @@ void mptcp_set_state(struct sock *sk, int state)
if (oldstate != TCP_ESTABLISHED)
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
break;
-
+ case TCP_CLOSE_WAIT:
+ /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state:
+ * MPTCP "accepted" sockets will be created later on. So no
+ * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT.
+ */
+ break;
default:
- if (oldstate == TCP_ESTABLISHED)
+ if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
}
@@ -2925,7 +3002,7 @@ static void mptcp_check_send_data_fin(struct sock *sk)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
+ pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n",
msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
msk->snd_nxt, msk->write_seq);
@@ -2949,7 +3026,7 @@ static void __mptcp_wr_shutdown(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
+ pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n",
msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
!!mptcp_send_head(sk));
@@ -2964,7 +3041,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
might_sleep();
@@ -2975,7 +3052,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk->sk_prot->destroy(sk);
- WARN_ON_ONCE(msk->rmem_fwd_alloc);
+ WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc));
WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
@@ -3072,9 +3149,8 @@ cleanup:
mptcp_set_state(sk, TCP_CLOSE);
sock_hold(sk);
- pr_debug("msk=%p state=%d", sk, sk->sk_state);
- if (msk->token)
- mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+ pr_debug("msk=%p state=%d\n", sk, sk->sk_state);
+ mptcp_pm_connection_closed(msk);
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
@@ -3140,8 +3216,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
mptcp_stop_rtx_timer(sk);
mptcp_stop_tout_timer(sk);
- if (msk->token)
- mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+ mptcp_pm_connection_closed(msk);
/* msk->subflow is still intact, the following will not free the first
* subflow
@@ -3150,16 +3225,16 @@ static int mptcp_disconnect(struct sock *sk, int flags)
WRITE_ONCE(msk->flags, 0);
msk->cb_flags = 0;
msk->recovery = false;
- msk->can_ack = false;
- msk->fully_established = false;
- msk->rcv_data_fin = false;
- msk->snd_data_fin_enable = false;
- msk->rcv_fastclose = false;
- msk->use_64bit_ack = false;
- msk->bytes_consumed = 0;
+ WRITE_ONCE(msk->can_ack, false);
+ WRITE_ONCE(msk->fully_established, false);
+ WRITE_ONCE(msk->rcv_data_fin, false);
+ WRITE_ONCE(msk->snd_data_fin_enable, false);
+ WRITE_ONCE(msk->rcv_fastclose, false);
+ WRITE_ONCE(msk->use_64bit_ack, false);
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_reset(msk);
mptcp_ca_reset(sk);
+ msk->bytes_consumed = 0;
msk->bytes_acked = 0;
msk->bytes_received = 0;
msk->bytes_sent = 0;
@@ -3250,17 +3325,17 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
mptcp_copy_ip_options(nsk, sk);
msk = mptcp_sk(nsk);
- msk->local_key = subflow_req->local_key;
- msk->token = subflow_req->token;
+ WRITE_ONCE(msk->local_key, subflow_req->local_key);
+ WRITE_ONCE(msk->token, subflow_req->token);
msk->in_accept_queue = 1;
WRITE_ONCE(msk->fully_established, false);
if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
WRITE_ONCE(msk->csum_enabled, true);
- msk->write_seq = subflow_req->idsn + 1;
- msk->snd_nxt = msk->write_seq;
- msk->snd_una = msk->write_seq;
- msk->wnd_end = msk->snd_nxt + tcp_sk(ssk)->snd_wnd;
+ WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1);
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
+ WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
mptcp_init_sched(msk, mptcp_sk(sk)->sched);
@@ -3363,9 +3438,6 @@ void __mptcp_data_acked(struct sock *sk)
__mptcp_clean_una(sk);
else
__set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
-
- if (mptcp_pending_data_fin_ack(sk))
- mptcp_schedule_work(sk);
}
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
@@ -3448,7 +3520,7 @@ static void schedule_3rdack_retransmission(struct sock *ssk)
struct tcp_sock *tp = tcp_sk(ssk);
unsigned long timeout;
- if (mptcp_subflow_ctx(ssk)->fully_established)
+ if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established))
return;
/* reschedule with a timeout above RTT, as we must look only for drop */
@@ -3459,7 +3531,8 @@ static void schedule_3rdack_retransmission(struct sock *ssk)
timeout += jiffies;
WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ smp_store_release(&icsk->icsk_ack.pending,
+ icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout);
}
@@ -3492,7 +3565,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status)
static int mptcp_hash(struct sock *sk)
{
/* should never be called,
- * we hash the TCP subflows not the master socket
+ * we hash the TCP subflows not the MPTCP socket
*/
WARN_ON_ONCE(1);
return 0;
@@ -3507,7 +3580,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p, ssk=%p", msk, msk->first);
+ pr_debug("msk=%p, ssk=%p\n", msk, msk->first);
if (WARN_ON_ONCE(!msk->first))
return -EINVAL;
@@ -3524,7 +3597,7 @@ void mptcp_finish_connect(struct sock *ssk)
sk = subflow->conn;
msk = mptcp_sk(sk);
- pr_debug("msk=%p, token=%u", sk, subflow->token);
+ pr_debug("msk=%p, token=%u\n", sk, subflow->token);
subflow->map_seq = subflow->iasn;
subflow->map_subflow_seq = 1;
@@ -3553,7 +3626,7 @@ bool mptcp_finish_join(struct sock *ssk)
struct sock *parent = (void *)msk;
bool ret = true;
- pr_debug("msk=%p, subflow=%p", msk, subflow);
+ pr_debug("msk=%p, subflow=%p\n", msk, subflow);
/* mptcp socket already closing? */
if (!mptcp_is_fully_established(parent)) {
@@ -3599,7 +3672,7 @@ err_prohibited:
static void mptcp_shutdown(struct sock *sk, int how)
{
- pr_debug("sk=%p, how=%d", sk, how);
+ pr_debug("sk=%p, how=%d\n", sk, how);
if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
__mptcp_wr_shutdown(sk);
@@ -3672,13 +3745,6 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
return 0;
}
-static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- subflow->request_mptcp = 0;
- __mptcp_do_fallback(msk);
-}
-
static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct mptcp_subflow_context *subflow;
@@ -3699,10 +3765,19 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info))
mptcp_subflow_early_fallback(msk, subflow);
#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) {
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT);
- mptcp_subflow_early_fallback(msk, subflow);
+ if (subflow->request_mptcp) {
+ if (mptcp_active_should_disable(sk)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED);
+ mptcp_subflow_early_fallback(msk, subflow);
+ } else if (mptcp_token_new_connect(ssk) < 0) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT);
+ mptcp_subflow_early_fallback(msk, subflow);
+ }
}
+
+ WRITE_ONCE(msk->write_seq, subflow->idsn);
+ WRITE_ONCE(msk->snd_nxt, subflow->idsn);
+ WRITE_ONCE(msk->snd_una, subflow->idsn);
if (likely(!__mptcp_check_fallback(msk)))
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
@@ -3767,6 +3842,7 @@ static struct proto mptcp_prot = {
.unhash = mptcp_unhash,
.get_port = mptcp_get_port,
.forward_alloc_get = mptcp_forward_alloc_get,
+ .stream_memory_free = mptcp_stream_memory_free,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
@@ -3815,7 +3891,7 @@ static int mptcp_listen(struct socket *sock, int backlog)
struct sock *ssk;
int err;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
lock_sock(sk);
@@ -3849,13 +3925,12 @@ unlock:
}
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
- int flags, bool kern)
+ struct proto_accept_arg *arg)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct sock *ssk, *newsk;
- int err;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
/* Buggy applications can call accept on socket states other then LISTEN
* but no need to allocate the first subflow just to error out.
@@ -3864,12 +3939,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssk)
return -EINVAL;
- pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
- newsk = inet_csk_accept(ssk, flags, &err, kern);
+ pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk));
+ newsk = inet_csk_accept(ssk, arg);
if (!newsk)
- return err;
+ return arg->err;
- pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
+ pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk));
if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
@@ -3888,7 +3963,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
newsk = new_mptcp_sock;
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
- newsk->sk_kern_sock = kern;
+ newsk->sk_kern_sock = arg->kern;
lock_sock(newsk);
__inet_accept(sock, newsock, newsk);
@@ -3916,10 +3991,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
mptcp_set_state(newsk, TCP_CLOSE);
}
} else {
- MPTCP_INC_STATS(sock_net(ssk),
- MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
tcpfallback:
- newsk->sk_kern_sock = kern;
+ newsk->sk_kern_sock = arg->kern;
lock_sock(newsk);
__inet_accept(sock, newsock, newsk);
/* we are being invoked after accepting a non-mp-capable
@@ -3940,12 +4013,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
- if (sk_stream_is_writeable(sk))
+ if (__mptcp_stream_is_writeable(sk, 1))
return EPOLLOUT | EPOLLWRNORM;
- mptcp_set_nospace(sk);
- smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
- if (sk_stream_is_writeable(sk))
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
+ if (__mptcp_stream_is_writeable(sk, 1))
return EPOLLOUT | EPOLLWRNORM;
return 0;
@@ -3964,7 +4037,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
- pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
+ pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags);
if (state == TCP_LISTEN) {
struct sock *ssk = READ_ONCE(msk->first);
@@ -4076,11 +4149,13 @@ void __init mptcp_proto_init(void)
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");
- init_dummy_netdev(&mptcp_napi_dev);
+ mptcp_napi_dev = alloc_netdev_dummy(0);
+ if (!mptcp_napi_dev)
+ panic("Failed to allocate MPTCP dummy netdev\n");
for_each_possible_cpu(cpu) {
delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
INIT_LIST_HEAD(&delegated->head);
- netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+ netif_napi_add_tx(mptcp_napi_dev, &delegated->napi,
mptcp_napi_poll);
napi_enable(&delegated->napi);
}
@@ -4139,7 +4214,7 @@ int __init mptcp_proto_v6_init(void)
int err;
mptcp_v6_prot = mptcp_prot;
- strcpy(mptcp_v6_prot.name, "MPTCPv6");
+ strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name));
mptcp_v6_prot.slab = NULL;
mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 07f6242afc1a..ad21925af061 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,8 +12,7 @@
#include <net/inet_connection_sock.h>
#include <uapi/linux/mptcp.h>
#include <net/genetlink.h>
-
-#include "mptcp_pm_gen.h"
+#include <net/rstreason.h>
#define MPTCP_SUPPORTED_VERSION 1
@@ -113,10 +112,9 @@
#define MPTCP_RST_TRANSIENT BIT(0)
/* MPTCP socket atomic flags */
-#define MPTCP_NOSPACE 1
-#define MPTCP_WORK_RTX 2
-#define MPTCP_FALLBACK_DONE 4
-#define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_WORK_RTX 1
+#define MPTCP_FALLBACK_DONE 2
+#define MPTCP_WORK_CLOSE_SUBFLOW 3
/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING 1
@@ -151,22 +149,24 @@ struct mptcp_options_received {
u32 subflow_seq;
u16 data_len;
__sum16 csum;
- u16 suboptions;
+ struct_group(status,
+ u16 suboptions;
+ u16 use_map:1,
+ dsn64:1,
+ data_fin:1,
+ use_ack:1,
+ ack64:1,
+ mpc_map:1,
+ reset_reason:4,
+ reset_transient:1,
+ echo:1,
+ backup:1,
+ deny_join_id0:1,
+ __unused:2;
+ );
+ u8 join_id;
u32 token;
u32 nonce;
- u16 use_map:1,
- dsn64:1,
- data_fin:1,
- use_ack:1,
- ack64:1,
- mpc_map:1,
- reset_reason:4,
- reset_transient:1,
- echo:1,
- backup:1,
- deny_join_id0:1,
- __unused:2;
- u8 join_id;
u64 thmac;
u8 hmac[MPTCPOPT_HMAC_LEN];
struct mptcp_addr_info addr;
@@ -238,6 +238,12 @@ struct mptcp_pm_data {
struct mptcp_rm_list rm_list_rx;
};
+struct mptcp_pm_local {
+ struct mptcp_addr_info addr;
+ u8 flags;
+ int ifindex;
+};
+
struct mptcp_pm_addr_entry {
struct list_head list;
struct mptcp_addr_info addr;
@@ -260,8 +266,10 @@ struct mptcp_data_frag {
struct mptcp_sock {
/* inet_connection_sock must be the first member */
struct inet_connection_sock sk;
- u64 local_key;
- u64 remote_key;
+ u64 local_key; /* protected by the first subflow socket lock
+ * lockless access read
+ */
+ u64 remote_key; /* same as above */
u64 write_seq;
u64 bytes_sent;
u64 snd_nxt;
@@ -281,6 +289,9 @@ struct mptcp_sock {
u64 bytes_acked;
u64 snd_una;
u64 wnd_end;
+ u32 last_data_sent;
+ u32 last_data_recv;
+ u32 last_ack_recv;
unsigned long timer_ival;
u32 token;
int rmem_released;
@@ -306,6 +317,10 @@ struct mptcp_sock {
in_accept_queue:1,
free_first:1,
rcvspace_init:1;
+ u32 notsent_lowat;
+ int keepalive_cnt;
+ int keepalive_idle;
+ int keepalive_intvl;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
@@ -341,12 +356,30 @@ struct mptcp_sock {
#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \
list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
+extern struct genl_family mptcp_genl_family;
+
static inline void msk_owned_by_me(const struct mptcp_sock *msk)
{
sock_owned_by_me((const struct sock *)msk);
}
+#ifdef CONFIG_DEBUG_NET
+/* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */
+#undef tcp_sk
+#define tcp_sk(ptr) ({ \
+ typeof(ptr) _ptr = (ptr); \
+ WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \
+ container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \
+})
+#define mptcp_sk(ptr) ({ \
+ typeof(ptr) _ptr = (ptr); \
+ WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \
+ container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \
+})
+
+#else /* !CONFIG_DEBUG_NET */
#define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk)
+#endif
/* the msk socket don't use the backlog, also account for the bulk
* free memory
@@ -361,6 +394,11 @@ static inline int mptcp_win_from_space(const struct sock *sk, int space)
return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
}
+static inline int mptcp_space_from_win(const struct sock *sk, int win)
+{
+ return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win);
+}
+
static inline int __mptcp_space(const struct sock *sk)
{
return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
@@ -400,7 +438,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (msk->snd_una == READ_ONCE(msk->snd_nxt))
+ if (msk->snd_una == msk->snd_nxt)
return NULL;
return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list);
@@ -418,6 +456,7 @@ struct mptcp_subflow_request_sock {
u16 mp_capable : 1,
mp_join : 1,
backup : 1,
+ request_bkup : 1,
csum_reqd : 1,
allow_join_id0 : 1;
u8 local_id;
@@ -476,7 +515,6 @@ struct mptcp_subflow_context {
request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */
mp_join : 1, /* remote is JOINing */
- fully_established : 1, /* path validated */
pm_notified : 1, /* PM hook called for established status */
conn_finished : 1,
map_valid : 1,
@@ -493,9 +531,13 @@ struct mptcp_subflow_context {
stale : 1, /* unable to snd/rcv data, do not use for xmit */
valid_csum_seen : 1, /* at least one csum validated */
is_mptfo : 1, /* subflow is doing TFO */
- __unused : 10;
+ close_event_done : 1, /* has done the post-closed part */
+ mpc_drop : 1, /* the MPC option has been dropped in a rtx */
+ __unused : 9;
bool data_avail;
bool scheduled;
+ bool pm_listener; /* a listener managed by the kernel PM? */
+ bool fully_established; /* path validated */
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -538,7 +580,7 @@ struct mptcp_subflow_context {
static inline struct mptcp_subflow_context *
mptcp_subflow_ctx(const struct sock *sk)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
/* Use RCU on icsk_ulp_data only for sock diag code */
return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
@@ -558,6 +600,43 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
WRITE_ONCE(subflow->local_id, -1);
}
+/* Convert reset reasons in MPTCP to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_mptcp_reason(u32 reason)
+{
+ switch (reason) {
+ case MPTCP_RST_EUNSPEC:
+ return SK_RST_REASON_MPTCP_RST_EUNSPEC;
+ case MPTCP_RST_EMPTCP:
+ return SK_RST_REASON_MPTCP_RST_EMPTCP;
+ case MPTCP_RST_ERESOURCE:
+ return SK_RST_REASON_MPTCP_RST_ERESOURCE;
+ case MPTCP_RST_EPROHIBIT:
+ return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
+ case MPTCP_RST_EWQ2BIG:
+ return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
+ case MPTCP_RST_EBADPERF:
+ return SK_RST_REASON_MPTCP_RST_EBADPERF;
+ case MPTCP_RST_EMIDDLEBOX:
+ return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
+ default:
+ /* It should not happen, or else errors may occur
+ * in MPTCP layer
+ */
+ return SK_RST_REASON_ERROR;
+ }
+}
+
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ enum sk_rst_reason reason;
+
+ reason = sk_rst_convert_mptcp_reason(subflow->reset_reason);
+ tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
static inline u64
mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
{
@@ -622,6 +701,12 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net);
unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net);
+
+void mptcp_active_disable(struct sock *sk);
+bool mptcp_active_should_disable(struct sock *ssk);
+void mptcp_active_enable(struct sock *sk);
+
+void mptcp_get_available_schedulers(char *buf, size_t maxlen);
void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt);
@@ -649,7 +734,7 @@ bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr);
/* called with sk socket lock held */
-int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
struct socket **new_sock);
@@ -677,10 +762,15 @@ static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
static inline bool mptcp_epollin_ready(const struct sock *sk)
{
+ u64 data_avail = mptcp_data_avail(mptcp_sk(sk));
+
+ if (!data_avail)
+ return false;
+
/* mptcp doesn't have to deal with small skbs in the receive queue,
- * at it can always coalesce them
+ * as it can always coalesce them
*/
- return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) ||
+ return (data_avail >= sk->sk_rcvlowat) ||
(mem_cgroup_sockets_enabled && sk->sk_memcg &&
mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
READ_ONCE(tcp_memory_pressure);
@@ -697,7 +787,7 @@ static inline bool __tcp_can_send(const struct sock *ssk)
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
- if (subflow->request_join && !subflow->fully_established)
+ if (subflow->request_join && !READ_ONCE(subflow->fully_established))
return false;
return __tcp_can_send(mptcp_subflow_tcp_sock(subflow));
@@ -790,14 +880,36 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}
+static inline u32 mptcp_notsent_lowat(const struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ u32 val;
+
+ val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
+ return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
+}
+
+static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 notsent_bytes;
+
+ notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
+ return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
+}
+
+static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
+{
+ return mptcp_stream_memory_free(sk, wake) &&
+ __sk_stream_is_writeable(sk, wake);
+}
+
static inline void mptcp_write_space(struct sock *sk)
{
- if (sk_stream_is_writeable(sk)) {
- /* pairs with memory barrier in mptcp_poll */
- smp_mb();
- if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
- sk_stream_write_space(sk);
- }
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (mptcp_stream_memory_free(sk, 1))
+ sk_stream_write_space(sk);
}
static inline void __mptcp_sync_sndbuf(struct sock *sk)
@@ -808,7 +920,7 @@ static inline void __mptcp_sync_sndbuf(struct sock *sk)
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
return;
- new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0];
+ new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]);
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
@@ -901,6 +1013,8 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
+bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *remote);
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
@@ -920,29 +1034,19 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
struct mptcp_pm_add_entry *
mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
-int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
- unsigned int id,
- u8 *flags, int *ifindex);
-int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
- u8 *flags, int *ifindex);
-int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
- unsigned int id,
- u8 *flags, int *ifindex);
-int mptcp_pm_set_flags(struct net *net, struct nlattr *token,
- struct mptcp_pm_addr_entry *loc,
- struct mptcp_pm_addr_entry *rem, u8 bkup);
-int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup);
-int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
- struct mptcp_pm_addr_entry *loc,
- struct mptcp_pm_addr_entry *rem, u8 bkup);
+bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr);
+bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info);
+int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
-void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list);
-void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
- struct list_head *rm_list);
+void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry);
void mptcp_free_local_addr_list(struct mptcp_sock *msk);
@@ -958,6 +1062,8 @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo
const struct mptcp_options_received *mp_opt);
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
struct request_sock *req);
+int mptcp_nl_fill_addr(struct sk_buff *skb,
+ struct mptcp_pm_addr_entry *entry);
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
@@ -1022,6 +1128,18 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc);
+bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb);
+int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb);
+int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb);
+int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info);
+int mptcp_userspace_pm_get_addr(struct sk_buff *skb,
+ struct genl_info *info);
static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow)
{
@@ -1034,8 +1152,6 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo
void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_work(struct mptcp_sock *msk);
-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
- const struct mptcp_rm_list *rm_list);
unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
@@ -1055,7 +1171,6 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
-void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
@@ -1081,9 +1196,11 @@ static inline bool mptcp_check_fallback(const struct sock *sk)
static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
{
if (__mptcp_check_fallback(msk)) {
- pr_debug("TCP fallback already done (msk=%p)", msk);
+ pr_debug("TCP fallback already done (msk=%p)\n", msk);
return;
}
+ if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback)))
+ return;
set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
@@ -1117,7 +1234,15 @@ static inline void mptcp_do_fallback(struct sock *ssk)
}
}
-#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
+#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a)
+
+static inline void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ pr_fallback(msk);
+ subflow->request_mptcp = 0;
+ __mptcp_do_fallback(msk);
+}
static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
{
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index 4ab0693c069c..df7dbcfa3b71 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -51,6 +51,26 @@ struct mptcp_sched_ops *mptcp_sched_find(const char *name)
return ret;
}
+/* Build string with list of available scheduler values.
+ * Similar to tcp_get_available_congestion_control()
+ */
+void mptcp_get_available_schedulers(char *buf, size_t maxlen)
+{
+ struct mptcp_sched_ops *sched;
+ size_t offs = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", sched->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
+ }
+ rcu_read_unlock();
+}
+
int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
{
if (!sched->get_subflow)
@@ -64,7 +84,7 @@ int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
list_add_tail_rcu(&sched->list, &mptcp_sched_list);
spin_unlock(&mptcp_sched_list_lock);
- pr_debug("%s registered", sched->name);
+ pr_debug("%s registered\n", sched->name);
return 0;
}
@@ -96,7 +116,7 @@ int mptcp_init_sched(struct mptcp_sock *msk,
if (msk->sched->init)
msk->sched->init(msk);
- pr_debug("sched=%s", msk->sched->name);
+ pr_debug("sched=%s\n", msk->sched->name);
return 0;
}
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index c40f1428e602..505445a9598f 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -181,8 +181,6 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
switch (optname) {
case SO_KEEPALIVE:
- mptcp_sol_socket_sync_intval(msk, optname, val);
- return 0;
case SO_DEBUG:
case SO_MARK:
case SO_PRIORITY:
@@ -618,26 +616,42 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
}
if (ret == 0)
- strcpy(msk->ca_name, name);
+ strscpy(msk->ca_name, name, sizeof(msk->ca_name));
release_sock(sk);
return ret;
}
-static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
+ int (*set_val)(struct sock *, int),
+ int *msk_val, int val)
{
struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- int val;
+ int err = 0;
- if (optlen < sizeof(int))
- return -EINVAL;
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int ret;
- if (copy_from_sockptr(&val, optval, sizeof(val)))
- return -EFAULT;
+ lock_sock(ssk);
+ ret = set_val(ssk, val);
+ err = err ? : ret;
+ release_sock(ssk);
+ }
+
+ if (!err) {
+ *msk_val = val;
+ sockopt_seq_inc(msk);
+ }
+
+ return err;
+}
+
+static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
- lock_sock(sk);
sockopt_seq_inc(msk);
msk->cork = !!val;
mptcp_for_each_subflow(msk, subflow) {
@@ -649,25 +663,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optva
}
if (!val)
mptcp_check_and_set_pending(sk);
- release_sock(sk);
return 0;
}
-static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- int val;
- if (optlen < sizeof(int))
- return -EINVAL;
-
- if (copy_from_sockptr(&val, optval, sizeof(val)))
- return -EFAULT;
-
- lock_sock(sk);
sockopt_seq_inc(msk);
msk->nodelay = !!val;
mptcp_for_each_subflow(msk, subflow) {
@@ -679,8 +683,6 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op
}
if (val)
mptcp_check_and_set_pending(sk);
- release_sock(sk);
-
return 0;
}
@@ -803,25 +805,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
int ret, val;
switch (optname) {
- case TCP_INQ:
- ret = mptcp_get_int_option(msk, optval, optlen, &val);
- if (ret)
- return ret;
- if (val < 0 || val > 1)
- return -EINVAL;
-
- lock_sock(sk);
- msk->recvmsg_inq = !!val;
- release_sock(sk);
- return 0;
case TCP_ULP:
return -EOPNOTSUPP;
case TCP_CONGESTION:
return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
- case TCP_CORK:
- return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
- case TCP_NODELAY:
- return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
case TCP_DEFER_ACCEPT:
/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
@@ -834,7 +821,50 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
optval, optlen);
}
- return -EOPNOTSUPP;
+ ret = mptcp_get_int_option(msk, optval, optlen, &val);
+ if (ret)
+ return ret;
+
+ lock_sock(sk);
+ switch (optname) {
+ case TCP_INQ:
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ msk->recvmsg_inq = !!val;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ WRITE_ONCE(msk->notsent_lowat, val);
+ mptcp_write_space(sk);
+ break;
+ case TCP_CORK:
+ ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
+ break;
+ case TCP_NODELAY:
+ ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
+ break;
+ case TCP_KEEPIDLE:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
+ &tcp_sock_set_keepidle_locked,
+ &msk->keepalive_idle, val);
+ break;
+ case TCP_KEEPINTVL:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
+ &tcp_sock_set_keepintvl,
+ &msk->keepalive_intvl, val);
+ break;
+ case TCP_KEEPCNT:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
+ &tcp_sock_set_keepcnt,
+ &msk->keepalive_cnt,
+ val);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ }
+
+ release_sock(sk);
+ return ret;
}
int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -843,7 +873,7 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname,
struct mptcp_sock *msk = mptcp_sk(sk);
struct sock *ssk;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
if (level == SOL_SOCKET)
return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
@@ -907,6 +937,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
struct sock *sk = (struct sock *)msk;
u32 flags = 0;
bool slow;
+ u32 now;
memset(info, 0, sizeof(*info));
@@ -935,14 +966,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
if (READ_ONCE(msk->can_ack))
flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
info->mptcpi_flags = flags;
- mptcp_data_lock(sk);
- info->mptcpi_snd_una = msk->snd_una;
- info->mptcpi_rcv_nxt = msk->ack_seq;
- info->mptcpi_bytes_acked = msk->bytes_acked;
- mptcp_data_unlock(sk);
slow = lock_sock_fast(sk);
- info->mptcpi_csum_enabled = msk->csum_enabled;
+ info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
info->mptcpi_token = msk->token;
info->mptcpi_write_seq = msk->write_seq;
info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
@@ -951,7 +977,17 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
info->mptcpi_bytes_retrans = msk->bytes_retrans;
info->mptcpi_subflows_total = info->mptcpi_subflows +
__mptcp_has_initial_subflow(msk);
+ now = tcp_jiffies32;
+ info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
+ info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv);
unlock_sock_fast(sk, slow);
+
+ mptcp_data_lock(sk);
+ info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv);
+ info->mptcpi_snd_una = msk->snd_una;
+ info->mptcpi_rcv_nxt = msk->ack_seq;
+ info->mptcpi_bytes_acked = msk->bytes_acked;
+ mptcp_data_unlock(sk);
}
EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
@@ -963,6 +999,10 @@ static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, in
if (get_user(len, optlen))
return -EFAULT;
+ /* When used only to check if a fallback to TCP happened. */
+ if (len == 0)
+ return 0;
+
len = min_t(unsigned int, len, sizeof(struct mptcp_info));
mptcp_diag_fill_info(msk, &m_info);
@@ -1331,6 +1371,8 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
char __user *optval, int __user *optlen)
{
+ struct sock *sk = (void *)msk;
+
switch (optname) {
case TCP_ULP:
case TCP_CONGESTION:
@@ -1349,6 +1391,22 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
return mptcp_put_int_option(msk, optval, optlen, msk->cork);
case TCP_NODELAY:
return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+ case TCP_KEEPIDLE:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_idle ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ);
+ case TCP_KEEPINTVL:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_intvl ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ);
+ case TCP_KEEPCNT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_cnt ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes));
+ case TCP_NOTSENT_LOWAT:
+ return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
+ case TCP_IS_MPTCP:
+ return mptcp_put_int_option(msk, optval, optlen, 1);
}
return -EOPNOTSUPP;
}
@@ -1395,7 +1453,7 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname,
struct mptcp_sock *msk = mptcp_sk(sk);
struct sock *ssk;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
/* @@ the meaning of setsockopt() when the socket is connected and
* there are multiple subflows is not yet defined. It is up to the
@@ -1464,6 +1522,9 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
tcp_set_congestion_control(ssk, msk->ca_name, false, true);
__tcp_sock_set_cork(ssk, !!msk->cork);
__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
+ tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
+ tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
+ tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
@@ -1500,6 +1561,10 @@ int mptcp_set_rcvlowat(struct sock *sk, int val)
struct mptcp_subflow_context *subflow;
int space, cap;
+ /* bpf can land here with a wrong sk type */
+ if (sk->sk_protocol == IPPROTO_TCP)
+ return -EINVAL;
+
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
else
@@ -1514,7 +1579,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val)
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
return 0;
- space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val);
+ space = mptcp_space_from_win(sk, val);
if (space <= sk->sk_rcvbuf)
return 0;
@@ -1526,7 +1591,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val)
slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, space);
- tcp_sk(ssk)->window_clamp = val;
+ WRITE_ONCE(tcp_sk(ssk)->window_clamp, val);
unlock_sock_fast(ssk, slow);
}
return 0;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 71ba86246ff8..9f18217dddc8 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -15,13 +15,12 @@
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
-#include <uapi/linux/mptcp.h>
+
#include "protocol.h"
#include "mib.h"
@@ -40,7 +39,7 @@ static void subflow_req_destructor(struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- pr_debug("subflow_req=%p", subflow_req);
+ pr_debug("subflow_req=%p\n", subflow_req);
if (subflow_req->msk)
sock_put((struct sock *)subflow_req->msk);
@@ -75,7 +74,8 @@ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_
get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
- subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_generate_hmac(READ_ONCE(msk->local_key),
+ READ_ONCE(msk->remote_key),
subflow_req->local_nonce,
subflow_req->remote_nonce, hmac);
@@ -100,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
return NULL;
}
subflow_req->local_id = local_id;
+ subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req);
return msk;
}
@@ -131,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
}
}
+static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
+{
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ return -EPERM;
+}
+
/* Init mptcp request socket.
*
* Returns an error code if a JOIN has failed and a TCP reset
@@ -145,14 +153,16 @@ static int subflow_check_req(struct request_sock *req,
struct mptcp_options_received mp_opt;
bool opt_mp_capable, opt_mp_join;
- pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+ pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
- if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
return -EINVAL;
+ }
#endif
mptcp_get_options(skb, &mp_opt);
@@ -162,10 +172,17 @@ static int subflow_check_req(struct request_sock *req,
if (opt_mp_capable) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
+ if (unlikely(listener->pm_listener))
+ return subflow_reset_req_endp(req, skb);
if (opt_mp_join)
return 0;
} else if (opt_mp_join) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+
+ if (mp_opt.backup)
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
+ } else if (unlikely(listener->pm_listener)) {
+ return subflow_reset_req_endp(req, skb);
}
if (opt_mp_capable && listener->request_mptcp) {
@@ -215,11 +232,12 @@ again:
}
if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
- pr_debug("syn inet_sport=%d %d",
+ pr_debug("syn inet_sport=%d %d\n",
ntohs(inet_sk(sk_listener)->inet_sport),
ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
return -EPERM;
}
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
@@ -228,13 +246,15 @@ again:
subflow_req_create_thmac(subflow_req);
if (unlikely(req->syncookie)) {
- if (mptcp_can_accept_new_subflow(subflow_req->msk))
- subflow_init_req_cookie_join_save(subflow_req, skb);
- else
+ if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
return -EPERM;
+ }
+
+ subflow_init_req_cookie_join_save(subflow_req, skb);
}
- pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
+ pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token,
subflow_req->remote_nonce, subflow_req->msk);
}
@@ -282,10 +302,21 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
+static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb)
+{
+ const struct mptcp_ext *mpext = mptcp_get_ext(skb);
+
+ if (!mpext)
+ return SK_RST_REASON_NOT_SPECIFIED;
+
+ return sk_rst_convert_mptcp_reason(mpext->reset_reason);
+}
+
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
- struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
struct dst_entry *dst;
int err;
@@ -293,7 +324,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
tcp_rsk(req)->is_mptcp = 1;
subflow_init_req(req, sk);
- dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+ dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
if (!dst)
return NULL;
@@ -303,7 +334,8 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
dst_release(dst);
if (!req->syncookie)
- tcp_request_sock_ops.send_reset(sk, skb);
+ tcp_request_sock_ops.send_reset(sk, skb,
+ mptcp_get_rst_reason(skb));
return NULL;
}
@@ -352,7 +384,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
- struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
struct dst_entry *dst;
int err;
@@ -360,7 +393,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
tcp_rsk(req)->is_mptcp = 1;
subflow_init_req(req, sk);
- dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+ dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
if (!dst)
return NULL;
@@ -370,7 +403,8 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
dst_release(dst);
if (!req->syncookie)
- tcp6_request_sock_ops.send_reset(sk, skb);
+ tcp6_request_sock_ops.send_reset(sk, skb,
+ mptcp_get_rst_reason(skb));
return NULL;
}
#endif
@@ -406,7 +440,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
- tcp_send_active_reset(ssk, GFP_ATOMIC);
+ mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -504,7 +538,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
- pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
+ pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp) {
@@ -523,6 +557,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_capable = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
mptcp_finish_connect(sk);
+ mptcp_active_enable(parent);
mptcp_propagate_state(parent, sk, subflow, &mp_opt);
} else if (subflow->request_join) {
u8 hmac[SHA256_DIGEST_SIZE];
@@ -536,7 +571,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce;
WRITE_ONCE(subflow->remote_id, mp_opt.join_id);
- pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n",
subflow, subflow->thmac, subflow->remote_nonce,
subflow->backup);
@@ -558,13 +593,19 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+ if (subflow->backup)
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX);
+
if (subflow_use_different_dport(msk, sk)) {
- pr_debug("synack inet_dport=%d %d",
+ pr_debug("synack inet_dport=%d %d\n",
ntohs(inet_sk(sk)->inet_dport),
ntohs(inet_sk(parent)->inet_dport));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
}
} else if (mptcp_check_fallback(sk)) {
+ /* It looks like MPTCP is blocked, while TCP is not */
+ if (subflow->mpc_drop)
+ mptcp_active_disable(parent);
fallback:
mptcp_propagate_state(parent, sk, subflow, NULL);
}
@@ -595,6 +636,8 @@ static int subflow_chk_local_id(struct sock *sk)
return err;
subflow_set_local_id(subflow, err);
+ subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk);
+
return 0;
}
@@ -627,7 +670,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
/* Never answer to SYNs sent to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -658,7 +701,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
if (skb->protocol == htons(ETH_P_IP))
return subflow_v4_conn_request(sk, skb);
@@ -714,7 +757,8 @@ static bool subflow_hmac_valid(const struct request_sock *req,
if (!msk)
return false;
- subflow_generate_hmac(msk->remote_key, msk->local_key,
+ subflow_generate_hmac(READ_ONCE(msk->remote_key),
+ READ_ONCE(msk->local_key),
subflow_req->remote_nonce,
subflow_req->local_nonce, hmac);
@@ -756,7 +800,7 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
const struct mptcp_options_received *mp_opt)
{
subflow_set_remote_key(msk, subflow, mp_opt);
- subflow->fully_established = 1;
+ WRITE_ONCE(subflow->fully_established, true);
WRITE_ONCE(msk->fully_established, true);
if (subflow->is_mptfo)
@@ -774,10 +818,11 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+ enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
- pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+ pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn);
/* After child creation we must look for MPC even when options
* are not parsed
@@ -868,18 +913,23 @@ create_child:
ctx->conn = (struct sock *)owner;
if (subflow_use_different_sport(owner, sk)) {
- pr_debug("ack inet_sport=%d %d",
+ pr_debug("ack inet_sport=%d %d\n",
ntohs(inet_sk(sk)->inet_sport),
ntohs(inet_sk((struct sock *)owner)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
goto dispose_child;
}
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
}
- if (!mptcp_finish_join(child))
+ if (!mptcp_finish_join(child)) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child);
+
+ subflow_add_reset_reason(skb, subflow->reset_reason);
goto dispose_child;
+ }
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
tcp_rsk(req)->drop_req = true;
@@ -887,7 +937,7 @@ create_child:
}
/* check for expected invariant - should never trigger, just help
- * catching eariler subtle bugs
+ * catching earlier subtle bugs
*/
WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
(!mptcp_subflow_ctx(child) ||
@@ -899,12 +949,15 @@ dispose_child:
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
- req->rsk_ops->send_reset(sk, skb);
+ reason = mptcp_get_rst_reason(skb);
+ req->rsk_ops->send_reset(sk, skb, reason);
/* The last child reference will be released by the caller */
return child;
fallback:
+ if (fallback)
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
mptcp_subflow_drop_ctx(child);
return child;
}
@@ -918,12 +971,13 @@ enum mapping_status {
MAPPING_EMPTY,
MAPPING_DATA_FIN,
MAPPING_DUMMY,
- MAPPING_BAD_CSUM
+ MAPPING_BAD_CSUM,
+ MAPPING_NODSS
};
static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
- pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+ pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n",
ssn, subflow->map_subflow_seq, subflow->map_data_len);
}
@@ -933,8 +987,10 @@ static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
unsigned int skb_consumed;
skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
- if (WARN_ON_ONCE(skb_consumed >= skb->len))
+ if (unlikely(skb_consumed >= skb->len)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return true;
+ }
return skb->len - skb_consumed <= subflow->map_data_len -
mptcp_subflow_get_map_offset(subflow);
@@ -1073,8 +1129,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
return MAPPING_EMPTY;
}
+ /* If the required DSS has likely been dropped by a middlebox */
if (!subflow->map_valid)
- return MAPPING_INVALID;
+ return MAPPING_NODSS;
goto validate_seq;
}
@@ -1083,17 +1140,18 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
data_len = mpext->data_len;
if (data_len == 0) {
- pr_debug("infinite mapping received");
+ pr_debug("infinite mapping received\n");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
- subflow->map_data_len = 0;
return MAPPING_INVALID;
}
if (mpext->data_fin == 1) {
+ u64 data_fin_seq;
+
if (data_len == 1) {
bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
mpext->dsn64);
- pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
+ pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq);
if (subflow->map_valid) {
/* A DATA_FIN might arrive in a DSS
* option before the previous mapping
@@ -1102,26 +1160,26 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
*/
skb_ext_del(skb, SKB_EXT_MPTCP);
return MAPPING_OK;
- } else {
- if (updated)
- mptcp_schedule_work((struct sock *)msk);
-
- return MAPPING_DATA_FIN;
}
- } else {
- u64 data_fin_seq = mpext->data_seq + data_len - 1;
- /* If mpext->data_seq is a 32-bit value, data_fin_seq
- * must also be limited to 32 bits.
- */
- if (!mpext->dsn64)
- data_fin_seq &= GENMASK_ULL(31, 0);
+ if (updated)
+ mptcp_schedule_work((struct sock *)msk);
- mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
- pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
- data_fin_seq, mpext->dsn64);
+ return MAPPING_DATA_FIN;
}
+ data_fin_seq = mpext->data_seq + data_len - 1;
+
+ /* If mpext->data_seq is a 32-bit value, data_fin_seq must also
+ * be limited to 32 bits.
+ */
+ if (!mpext->dsn64)
+ data_fin_seq &= GENMASK_ULL(31, 0);
+
+ mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
+ pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n",
+ data_fin_seq, mpext->dsn64);
+
/* Adjust for DATA_FIN using 1 byte of sequence space */
data_len--;
}
@@ -1165,7 +1223,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
if (unlikely(subflow->map_csum_reqd != csum_reqd))
return MAPPING_INVALID;
- pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+ pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
subflow->map_seq, subflow->map_subflow_seq,
subflow->map_data_len, subflow->map_csum_reqd,
subflow->map_data_csum);
@@ -1190,14 +1248,22 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
- u32 incr;
+ struct tcp_sock *tp = tcp_sk(ssk);
+ u32 offset, incr, avail_len;
- incr = limit >= skb->len ? skb->len + fin : limit;
+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+ if (WARN_ON_ONCE(offset > skb->len))
+ goto out;
- pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
- subflow->map_subflow_seq);
+ avail_len = skb->len - offset;
+ incr = limit >= avail_len ? avail_len + fin : limit;
+
+ pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len,
+ offset, subflow->map_subflow_seq);
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
tcp_sk(ssk)->copied_seq += incr;
+
+out:
if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
sk_eat_skb(ssk, skb);
if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
@@ -1207,24 +1273,16 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
/* sched mptcp worker to remove the subflow if no more data is pending */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
- if (likely(ssk->sk_state != TCP_CLOSE))
+ struct sock *sk = (struct sock *)msk;
+
+ if (likely(ssk->sk_state != TCP_CLOSE &&
+ (ssk->sk_state != TCP_CLOSE_WAIT ||
+ inet_sk_state_load(sk) != TCP_ESTABLISHED)))
return;
if (skb_queue_empty(&ssk->sk_receive_queue) &&
!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
- mptcp_schedule_work((struct sock *)msk);
-}
-
-static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
-{
- struct mptcp_sock *msk = mptcp_sk(subflow->conn);
-
- if (subflow->mp_join)
- return false;
- else if (READ_ONCE(msk->csum_enabled))
- return !subflow->valid_csum_seen;
- else
- return !subflow->fully_established;
+ mptcp_schedule_work(sk);
}
static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
@@ -1232,7 +1290,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
unsigned long fail_tout;
- /* greceful failure can happen only on the MPC subflow */
+ /* graceful failure can happen only on the MPC subflow */
if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
return;
@@ -1274,7 +1332,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
status = get_mapping_status(ssk, msk);
trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
- status == MAPPING_BAD_CSUM))
+ status == MAPPING_BAD_CSUM || status == MAPPING_NODSS))
goto fallback;
if (status != MAPPING_OK)
@@ -1289,7 +1347,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
old_ack = READ_ONCE(msk->ack_seq);
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
- pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack,
ack_seq);
if (unlikely(before64(ack_seq, old_ack))) {
mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
@@ -1322,19 +1380,21 @@ fallback:
return true;
}
- if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
+ if (!READ_ONCE(msk->allow_infinite_fallback)) {
/* fatal protocol error, close the socket.
* subflow_error_report() will introduce the appropriate barriers
*/
subflow->reset_transient = 0;
- subflow->reset_reason = MPTCP_RST_EMPTCP;
+ subflow->reset_reason = status == MAPPING_NODSS ?
+ MPTCP_RST_EMIDDLEBOX :
+ MPTCP_RST_EMPTCP;
reset:
WRITE_ONCE(ssk->sk_err, EBADMSG);
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
- tcp_send_active_reset(ssk, GFP_ATOMIC);
+ mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
@@ -1361,7 +1421,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
subflow->map_valid = 0;
WRITE_ONCE(subflow->data_avail, false);
- pr_debug("Done with mapping: seq=%u data_len=%u",
+ pr_debug("Done with mapping: seq=%u data_len=%u\n",
subflow->map_subflow_seq,
subflow->map_data_len);
}
@@ -1471,7 +1531,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
- pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+ pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n",
subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
if (likely(icsk->icsk_af_ops == target))
@@ -1513,28 +1573,31 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
+ int local_id = local->addr.id;
struct sockaddr_storage addr;
int remote_id = remote->id;
- int local_id = loc->id;
int err = -ENOTCONN;
struct socket *sf;
struct sock *ssk;
u32 remote_token;
int addrlen;
- int ifindex;
- u8 flags;
+ /* The userspace PM sent the request too early? */
if (!mptcp_is_fully_established(sk))
goto err_out;
- err = mptcp_subflow_create_socket(sk, loc->family, &sf);
- if (err)
+ err = mptcp_subflow_create_socket(sk, local->addr.family, &sf);
+ if (err) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR);
+ pr_debug("msk=%p local=%d remote=%d create sock error: %d\n",
+ msk, local_id, remote_id, err);
goto err_out;
+ }
ssk = sf->sk;
subflow = mptcp_subflow_ctx(ssk);
@@ -1542,42 +1605,61 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
get_random_bytes(&subflow->local_nonce, sizeof(u32));
} while (!subflow->local_nonce);
- if (local_id)
+ /* if 'IPADDRANY', the ID will be set later, after the routing */
+ if (local->addr.family == AF_INET) {
+ if (!local->addr.addr.s_addr)
+ local_id = -1;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_any(&local->addr.addr6))
+ local_id = -1;
+#endif
+ }
+
+ if (local_id >= 0)
subflow_set_local_id(subflow, local_id);
- mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
- &flags, &ifindex);
subflow->remote_key_valid = 1;
- subflow->remote_key = msk->remote_key;
- subflow->local_key = msk->local_key;
+ subflow->remote_key = READ_ONCE(msk->remote_key);
+ subflow->local_key = READ_ONCE(msk->local_key);
subflow->token = msk->token;
- mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
+ mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family);
addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (addr.ss_family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- ssk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = local->ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
- if (err)
+ if (err) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR);
+ pr_debug("msk=%p local=%d remote=%d bind error: %d\n",
+ msk, local_id, remote_id, err);
goto failed;
+ }
mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
- pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
+ pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk,
remote_token, local_id, remote_id);
subflow->remote_token = remote_token;
WRITE_ONCE(subflow->remote_id, remote_id);
subflow->request_join = 1;
- subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
subflow->subflow_id = msk->subflow_id++;
mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
sock_hold(ssk);
list_add_tail(&subflow->node, &msk->conn_list);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
- if (err && err != -EINPROGRESS)
+ if (err && err != -EINPROGRESS) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR);
+ pr_debug("msk=%p local=%d remote=%d connect error: %d\n",
+ msk, local_id, remote_id, err);
goto failed_unlink;
+ }
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX);
/* discard the subflow socket */
mptcp_sock_graft(ssk, sk->sk_socket);
@@ -1677,10 +1759,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
* needs it.
* Update ns_tracker to current stack trace and refcounted tracker.
*/
- __netns_tracker_free(net, &sf->sk->ns_tracker, false);
- sf->sk->sk_net_refcnt = 1;
- get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sf->sk);
err = tcp_set_ulp(sf->sk, "mptcp");
if (err)
goto err_free;
@@ -1688,7 +1767,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
release_sock(sf->sk);
- /* the newly created socket really belongs to the owning MPTCP master
+ /* the newly created socket really belongs to the owning MPTCP
* socket, even if for additional subflows the allocation is performed
* by a kernel workqueue. Adjust inode references, so that the
* procfs/diag interfaces really show this one belonging to the correct
@@ -1699,7 +1778,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
subflow = mptcp_subflow_ctx(sf->sk);
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
*new_sock = sf;
sock_hold(sk);
@@ -1728,7 +1807,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
INIT_LIST_HEAD(&ctx->node);
INIT_LIST_HEAD(&ctx->delegated_node);
- pr_debug("subflow=%p", ctx);
+ pr_debug("subflow=%p\n", ctx);
ctx->tcp_sock = sk;
WRITE_ONCE(ctx->local_id, -1);
@@ -1879,7 +1958,7 @@ static int subflow_ulp_init(struct sock *sk)
goto out;
}
- pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
+ pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family);
tp->is_mptcp = 1;
ctx->icsk_af_ops = icsk->icsk_af_ops;
@@ -1954,7 +2033,6 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
new_ctx->tcp_error_report = old_ctx->tcp_error_report;
new_ctx->rel_write_seq = 1;
- new_ctx->tcp_sock = newsk;
if (subflow_req->mp_capable) {
/* see comments in subflow_syn_recv_sock(), MPTCP connection
@@ -1971,9 +2049,10 @@ static void subflow_ulp_clone(const struct request_sock *req,
} else if (subflow_req->mp_join) {
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
- new_ctx->fully_established = 1;
+ WRITE_ONCE(new_ctx->fully_established, true);
new_ctx->remote_key_valid = 1;
new_ctx->backup = subflow_req->backup;
+ new_ctx->request_bkup = subflow_req->request_bkup;
WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
new_ctx->token = subflow_req->token;
new_ctx->thmac = subflow_req->thmac;
diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c
index bfff53e668da..4fc39fa2e262 100644
--- a/net/mptcp/token_test.c
+++ b/net/mptcp/token_test.c
@@ -52,14 +52,19 @@ static struct mptcp_subflow_context *build_ctx(struct kunit *test)
static struct mptcp_sock *build_msk(struct kunit *test)
{
struct mptcp_sock *msk;
+ struct sock *sk;
msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER);
KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk);
refcount_set(&((struct sock *)msk)->sk_refcnt, 1);
sock_net_set((struct sock *)msk, &init_net);
+ sk = (struct sock *)msk;
+
/* be sure the token helpers can dereference sk->sk_prot */
- ((struct sock *)msk)->sk_prot = &tcp_prot;
+ sk->sk_prot = &tcp_prot;
+ sk->sk_protocol = IPPROTO_MPTCP;
+
return msk;
}