Diffstat (limited to 'net/mptcp')
-rw-r--r--  net/mptcp/Makefile           3
-rw-r--r--  net/mptcp/bpf.c             15
-rw-r--r--  net/mptcp/crypto.c          35
-rw-r--r--  net/mptcp/crypto_test.c      1
-rw-r--r--  net/mptcp/ctrl.c           369
-rw-r--r--  net/mptcp/diag.c            54
-rw-r--r--  net/mptcp/fastopen.c        45
-rw-r--r--  net/mptcp/mib.c             39
-rw-r--r--  net/mptcp/mib.h             50
-rw-r--r--  net/mptcp/mptcp_diag.c      21
-rw-r--r--  net/mptcp/mptcp_pm_gen.c   180
-rw-r--r--  net/mptcp/mptcp_pm_gen.h    59
-rw-r--r--  net/mptcp/options.c        238
-rw-r--r--  net/mptcp/pm.c             833
-rw-r--r--  net/mptcp/pm_kernel.c     1624
-rw-r--r--  net/mptcp/pm_netlink.c    1909
-rw-r--r--  net/mptcp/pm_userspace.c   639
-rw-r--r--  net/mptcp/protocol.c      2735
-rw-r--r--  net/mptcp/protocol.h       654
-rw-r--r--  net/mptcp/sched.c          215
-rw-r--r--  net/mptcp/sockopt.c        600
-rw-r--r--  net/mptcp/subflow.c        867
-rw-r--r--  net/mptcp/token.c           14
-rw-r--r--  net/mptcp/token_test.c       9
24 files changed, 7125 insertions, 4083 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index a3829ce548f9..89bf6c47c818 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,8 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
- mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
+ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o \
+ mptcp_pm_gen.o pm_kernel.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index 5a0a84ad94af..8a16672b94e2 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -19,3 +19,18 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk)
return NULL;
}
+
+BTF_SET8_START(bpf_mptcp_fmodret_ids)
+BTF_ID_FLAGS(func, update_socket_protocol)
+BTF_SET8_END(bpf_mptcp_fmodret_ids)
+
+static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_mptcp_fmodret_ids,
+};
+
+static int __init bpf_mptcp_kfunc_init(void)
+{
+ return register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
+}
+late_initcall(bpf_mptcp_kfunc_init);
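
The hunk above only registers update_socket_protocol() as a valid fmod_ret target; the BPF program that uses it lives elsewhere. As an illustration only (a minimal sketch modelled on the kernel's "mptcpify" BPF selftest, not part of this diff; the AF_*/IPPROTO_* values are spelled out to keep it self-contained), such a program can turn plain TCP stream sockets into MPTCP ones:

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: force IPPROTO_MPTCP for every TCP stream socket. */
#include "vmlinux.h"
#include <bpf/bpf_tracing.h>

#define AF_INET		2
#define AF_INET6	10
#define SOCK_STREAM	1
#define IPPROTO_TCP	6
#define IPPROTO_MPTCP	262

char _license[] SEC("license") = "GPL";

SEC("fmod_ret/update_socket_protocol")
int BPF_PROG(mptcpify, int family, int type, int protocol)
{
	if ((family == AF_INET || family == AF_INET6) &&
	    type == SOCK_STREAM &&
	    (!protocol || protocol == IPPROTO_TCP))
		return IPPROTO_MPTCP;

	return protocol;
}
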
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index a8931349933c..31948e18d97d 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -22,7 +22,6 @@
#include <linux/kernel.h>
#include <crypto/sha2.h>
-#include <asm/unaligned.h>
#include "protocol.h"
@@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
- u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
- u8 key1be[8];
- u8 key2be[8];
- int i;
+ __be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) };
- if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
- len = SHA256_DIGEST_SIZE;
-
- put_unaligned_be64(key1, key1be);
- put_unaligned_be64(key2, key2be);
-
- /* Generate key xored with ipad */
- memset(input, 0x36, SHA256_BLOCK_SIZE);
- for (i = 0; i < 8; i++)
- input[i] ^= key1be[i];
- for (i = 0; i < 8; i++)
- input[i + 8] ^= key2be[i];
-
- memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
-
- /* emit sha256(K1 || msg) on the second input block, so we can
- * reuse 'input' for the last hashing
- */
- sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]);
-
- /* Prepare second part of hmac */
- memset(input, 0x5C, SHA256_BLOCK_SIZE);
- for (i = 0; i < 8; i++)
- input[i] ^= key1be[i];
- for (i = 0; i < 8; i++)
- input[i + 8] ^= key2be[i];
-
- sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
+ hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac);
}
#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
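
The rewrite above relies on the old open-coded ipad/opad construction being exactly HMAC-SHA256 keyed with the 16-byte concatenation of the two big-endian 64-bit keys. A userspace reference sketch (illustrative only, using OpenSSL instead of the kernel helper; the function name is made up) producing the same digest:

#include <endian.h>
#include <stdint.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

/* Same construction as mptcp_crypto_hmac_sha(): key = be64(key1) || be64(key2) */
static void mptcp_hmac_ref(uint64_t key1, uint64_t key2,
			   const uint8_t *msg, int len, uint8_t out[32])
{
	uint64_t be[2] = { htobe64(key1), htobe64(key2) };
	unsigned int outlen = 32;

	HMAC(EVP_sha256(), be, sizeof(be), msg, len, out, &outlen);
}
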
diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c
index 017248dea038..220414e5c850 100644
--- a/net/mptcp/crypto_test.c
+++ b/net/mptcp/crypto_test.c
@@ -70,3 +70,4 @@ static struct kunit_suite mptcp_crypto_suite = {
kunit_test_suite(mptcp_crypto_suite);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KUnit tests for MPTCP Crypto");
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index ae20b7d92e28..d96130e49942 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -12,6 +12,7 @@
#include <net/netns/generic.h>
#include "protocol.h"
+#include "mib.h"
#define MPTCP_SYSCTL_PATH "net/mptcp"
@@ -27,11 +28,18 @@ struct mptcp_pernet {
#endif
unsigned int add_addr_timeout;
+ unsigned int blackhole_timeout;
+ unsigned int close_timeout;
unsigned int stale_loss_cnt;
+ atomic_t active_disable_times;
+ u8 syn_retrans_before_tcp_fallback;
+ unsigned long active_disable_stamp;
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
+ char scheduler[MPTCP_SCHED_NAME_MAX];
+ char path_manager[MPTCP_PM_NAME_MAX];
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -64,22 +72,205 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
return mptcp_get_pernet(net)->stale_loss_cnt;
}
+unsigned int mptcp_close_timeout(const struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_DEAD))
+ return TCP_TIMEWAIT_LEN;
+ return mptcp_get_pernet(sock_net(sk))->close_timeout;
+}
+
int mptcp_get_pm_type(const struct net *net)
{
return mptcp_get_pernet(net)->pm_type;
}
+const char *mptcp_get_path_manager(const struct net *net)
+{
+ return mptcp_get_pernet(net)->path_manager;
+}
+
+const char *mptcp_get_scheduler(const struct net *net)
+{
+ return mptcp_get_pernet(net)->scheduler;
+}
+
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
+ pernet->blackhole_timeout = 3600;
+ pernet->syn_retrans_before_tcp_fallback = 2;
+ atomic_set(&pernet->active_disable_times, 0);
+ pernet->close_timeout = TCP_TIMEWAIT_LEN;
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
+ strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler));
+ strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager));
}
#ifdef CONFIG_SYSCTL
+static int mptcp_set_scheduler(char *scheduler, const char *name)
+{
+ struct mptcp_sched_ops *sched;
+ int ret = 0;
+
+ rcu_read_lock();
+ sched = mptcp_sched_find(name);
+ if (sched)
+ strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int proc_scheduler(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data;
+ char val[MPTCP_SCHED_NAME_MAX];
+ struct ctl_table tbl = {
+ .data = val,
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ };
+ int ret;
+
+ strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = mptcp_set_scheduler(*scheduler, val);
+
+ return ret;
+}
+
+static int proc_available_schedulers(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+
+ return ret;
+}
+
+static int proc_blackhole_detect_timeout(const struct ctl_table *table,
+ int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct mptcp_pernet *pernet = container_of(table->data,
+ struct mptcp_pernet,
+ blackhole_timeout);
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ atomic_set(&pernet->active_disable_times, 0);
+
+ return ret;
+}
+
+static int mptcp_set_path_manager(char *path_manager, const char *name)
+{
+ struct mptcp_pm_ops *pm_ops;
+ int ret = 0;
+
+ rcu_read_lock();
+ pm_ops = mptcp_pm_find(name);
+ if (pm_ops)
+ strscpy(path_manager, name, MPTCP_PM_NAME_MAX);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int proc_path_manager(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct mptcp_pernet *pernet = container_of(ctl->data,
+ struct mptcp_pernet,
+ path_manager);
+ char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data;
+ char pm_name[MPTCP_PM_NAME_MAX];
+ const struct ctl_table tbl = {
+ .data = pm_name,
+ .maxlen = MPTCP_PM_NAME_MAX,
+ };
+ int ret;
+
+ strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ ret = mptcp_set_path_manager(*path_manager, pm_name);
+ if (ret == 0) {
+ u8 pm_type = __MPTCP_PM_TYPE_NR;
+
+ if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0)
+ pm_type = MPTCP_PM_TYPE_KERNEL;
+ else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0)
+ pm_type = MPTCP_PM_TYPE_USERSPACE;
+ pernet->pm_type = pm_type;
+ }
+ }
+
+ return ret;
+}
+
+static int proc_pm_type(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct mptcp_pernet *pernet = container_of(ctl->data,
+ struct mptcp_pernet,
+ pm_type);
+ int ret;
+
+ ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ u8 pm_type = READ_ONCE(*(u8 *)ctl->data);
+ char *pm_name = "";
+
+ if (pm_type == MPTCP_PM_TYPE_KERNEL)
+ pm_name = "kernel";
+ else if (pm_type == MPTCP_PM_TYPE_USERSPACE)
+ pm_name = "userspace";
+ mptcp_set_path_manager(pernet->path_manager, pm_name);
+ }
+
+ return ret;
+}
+
+static int proc_available_path_managers(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+
+ return ret;
+}
+
static struct ctl_table mptcp_sysctl_table[] = {
{
.procname = "enabled",
@@ -124,11 +315,53 @@ static struct ctl_table mptcp_sysctl_table[] = {
.procname = "pm_type",
.maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
+ .proc_handler = proc_pm_type,
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
- {}
+ {
+ .procname = "scheduler",
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ .mode = 0644,
+ .proc_handler = proc_scheduler,
+ },
+ {
+ .procname = "available_schedulers",
+ .maxlen = MPTCP_SCHED_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_available_schedulers,
+ },
+ {
+ .procname = "close_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "blackhole_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_blackhole_detect_timeout,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "syn_retrans_before_tcp_fallback",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "path_manager",
+ .maxlen = MPTCP_PM_NAME_MAX,
+ .mode = 0644,
+ .proc_handler = proc_path_manager,
+ },
+ {
+ .procname = "available_path_managers",
+ .maxlen = MPTCP_PM_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_available_path_managers,
+ },
};
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -149,8 +382,16 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
-
- hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
+ table[6].data = &pernet->scheduler;
+ /* table[7] is for available_schedulers which is read-only info */
+ table[8].data = &pernet->close_timeout;
+ table[9].data = &pernet->blackhole_timeout;
+ table[10].data = &pernet->syn_retrans_before_tcp_fallback;
+ table[11].data = &pernet->path_manager;
+ /* table[12] is for available_path_managers which is read-only info */
+
+ hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
+ ARRAY_SIZE(mptcp_sysctl_table));
if (!hdr)
goto err_reg;
@@ -167,7 +408,7 @@ err_alloc:
static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
{
- struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
+ const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
unregister_net_sysctl_table(pernet->ctl_table_hdr);
@@ -185,6 +426,124 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}
#endif /* CONFIG_SYSCTL */
+/* The following code block deals with middlebox issues with MPTCP,
+ * similar to what is done with TFO.
+ * The approach is to disable active MPTCP globally when SYNs carrying MP_CAPABLE
+ * are dropped while plain SYNs (without MPC) are not. In that case, active-side
+ * MPTCP is disabled globally for 1h at first. If it happens again, it is
+ * disabled for 2h, then 4h, 8h, ...
+ * The timeout is reset back to 1h when an active MPTCP connection is
+ * fully established.
+ */
+
+/* Disable active MPTCP and record current jiffies and active_disable_times */
+void mptcp_active_disable(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct mptcp_pernet *pernet;
+
+ pernet = mptcp_get_pernet(net);
+
+ if (!READ_ONCE(pernet->blackhole_timeout))
+ return;
+
+ /* Paired with READ_ONCE() in mptcp_active_should_disable() */
+ WRITE_ONCE(pernet->active_disable_stamp, jiffies);
+
+ /* Paired with smp_rmb() in mptcp_active_should_disable().
+ * We want pernet->active_disable_stamp to be updated first.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&pernet->active_disable_times);
+
+ MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE);
+}
+
+/* Calculate timeout for MPTCP active disable
+ * Return true if we are still in the active MPTCP disable period
+ * Return false if timeout already expired and we should use active MPTCP
+ */
+bool mptcp_active_should_disable(struct sock *ssk)
+{
+ struct net *net = sock_net(ssk);
+ unsigned int blackhole_timeout;
+ struct mptcp_pernet *pernet;
+ unsigned long timeout;
+ int disable_times;
+ int multiplier;
+
+ pernet = mptcp_get_pernet(net);
+ blackhole_timeout = READ_ONCE(pernet->blackhole_timeout);
+
+ if (!blackhole_timeout)
+ return false;
+
+ disable_times = atomic_read(&pernet->active_disable_times);
+ if (!disable_times)
+ return false;
+
+ /* Paired with smp_mb__before_atomic() in mptcp_active_disable() */
+ smp_rmb();
+
+ /* Limit timeout to max: 2^6 * initial timeout */
+ multiplier = 1 << min(disable_times - 1, 6);
+
+ /* Paired with the WRITE_ONCE() in mptcp_active_disable(). */
+ timeout = READ_ONCE(pernet->active_disable_stamp) +
+ multiplier * blackhole_timeout * HZ;
+
+ return time_before(jiffies, timeout);
+}
+
+/* Enable active MPTCP and reset active_disable_times if needed */
+void mptcp_active_enable(struct sock *sk)
+{
+ struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk));
+
+ if (atomic_read(&pernet->active_disable_times)) {
+ struct net_device *dev;
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!(dev && (dev->flags & IFF_LOOPBACK)))
+ atomic_set(&pernet->active_disable_times, 0);
+ rcu_read_unlock();
+ }
+}
+
+/* Check the number of retransmissions, and fall back to TCP if needed */
+void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
+{
+ struct mptcp_subflow_context *subflow;
+ u8 timeouts, to_max;
+ struct net *net;
+
+ /* Only check MPTCP SYN ... */
+ if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT))
+ return;
+
+ subflow = mptcp_subflow_ctx(ssk);
+
+ /* ... + MP_CAPABLE */
+ if (!subflow->request_mptcp) {
+ /* Mark as blackhole only if the 1st non-MPTCP SYN is accepted */
+ subflow->mpc_drop = 0;
+ return;
+ }
+
+ net = sock_net(ssk);
+ timeouts = inet_csk(ssk)->icsk_retransmits;
+ to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback;
+
+ if (timeouts == to_max || (timeouts < to_max && expired)) {
+ subflow->mpc_drop = 1;
+ mptcp_early_fallback(mptcp_sk(subflow->conn), subflow,
+ MPTCP_MIB_MPCAPABLEACTIVEDROP);
+ }
+}
+
static int __net_init mptcp_net_init(struct net *net)
{
struct mptcp_pernet *pernet = mptcp_get_pernet(net);
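
For clarity, the backoff implemented by mptcp_active_disable() and mptcp_active_should_disable() above can be summarized by the following sketch (illustrative only, in plain seconds rather than jiffies; the function and parameter names are invented for the example):

#include <stdbool.h>

/* With the default blackhole_timeout of 3600s, the n-th consecutive
 * detection keeps active MPTCP disabled for min(2^(n-1), 64) hours,
 * counted from the last detection timestamp.
 */
static bool should_disable(unsigned long now, unsigned long stamp,
			   unsigned int timeout_secs, int disable_times)
{
	int multiplier;

	if (!timeout_secs || !disable_times)
		return false;

	/* Cap the exponential backoff at 2^6 times the base timeout */
	multiplier = 1 << (disable_times - 1 < 6 ? disable_times - 1 : 6);

	return now < stamp + (unsigned long)multiplier * timeout_secs;
}
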
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
index a536586742f2..70cf9ebce833 100644
--- a/net/mptcp/diag.c
+++ b/net/mptcp/diag.c
@@ -10,20 +10,24 @@
#include <linux/net.h>
#include <linux/inet_diag.h>
#include <net/netlink.h>
-#include <uapi/linux/mptcp.h>
#include "protocol.h"
-static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
+static int subflow_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
{
struct mptcp_subflow_context *sf;
struct nlattr *start;
u32 flags = 0;
+ bool slow;
int err;
+ if (inet_sk_state_load(sk) == TCP_LISTEN)
+ return 0;
+
start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
if (!start)
return -EMSGSIZE;
+ slow = lock_sock_fast(sk);
rcu_read_lock();
sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
if (!sf) {
@@ -43,7 +47,7 @@ static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM;
if (sf->request_bkup)
flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC;
- if (sf->fully_established)
+ if (READ_ONCE(sf->fully_established))
flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED;
if (sf->conn_finished)
flags |= MPTCP_SUBFLOW_FLAG_CONNECTED;
@@ -52,48 +56,60 @@ static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) ||
nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) ||
- nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
- sf->rel_write_seq) ||
- nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq,
- MPTCP_SUBFLOW_ATTR_PAD) ||
- nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
- sf->map_subflow_seq) ||
- nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) ||
- nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
- sf->map_data_len) ||
nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) ||
nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) ||
- nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) {
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) {
+ err = -EMSGSIZE;
+ goto nla_failure;
+ }
+
+ /* Only export seq related counters to user with CAP_NET_ADMIN */
+ if (net_admin &&
+ (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+ sf->rel_write_seq) ||
+ nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq,
+ MPTCP_SUBFLOW_ATTR_PAD) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+ sf->map_subflow_seq) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) ||
+ nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+ sf->map_data_len))) {
err = -EMSGSIZE;
goto nla_failure;
}
rcu_read_unlock();
+ unlock_sock_fast(sk, slow);
nla_nest_end(skb, start);
return 0;
nla_failure:
rcu_read_unlock();
+ unlock_sock_fast(sk, slow);
nla_nest_cancel(skb, start);
return err;
}
-static size_t subflow_get_info_size(const struct sock *sk)
+static size_t subflow_get_info_size(const struct sock *sk, bool net_admin)
{
size_t size = 0;
size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */
- nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
- nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
- nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
- nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
- nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */
nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */
nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */
0;
+
+ if (net_admin)
+ size += nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
+ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
+ 0;
+
return size;
}
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
index d237d142171c..82ec15bcfd7f 100644
--- a/net/mptcp/fastopen.c
+++ b/net/mptcp/fastopen.c
@@ -9,11 +9,18 @@
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
struct request_sock *req)
{
- struct sock *ssk = subflow->tcp_sock;
- struct sock *sk = subflow->conn;
+ struct sock *sk, *ssk;
struct sk_buff *skb;
struct tcp_sock *tp;
+ /* on early fallback the subflow context is deleted by
+ * subflow_syn_recv_sock()
+ */
+ if (!subflow)
+ return;
+
+ ssk = subflow->tcp_sock;
+ sk = subflow->conn;
tp = tcp_sk(ssk);
subflow->is_mptfo = 1;
@@ -25,7 +32,8 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
/* dequeue the skb from sk receive queue */
__skb_unlink(skb, &ssk->sk_receive_queue);
skb_ext_reset(skb);
- skb_orphan(skb);
+
+ mptcp_subflow_lend_fwdmem(subflow, skb);
/* We copy the fastopen data, but that don't belong to the mptcp sequence
* space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset()
@@ -33,41 +41,22 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
tp->copied_seq += skb->len;
subflow->ssn_offset += skb->len;
- /* initialize a dummy sequence number, we will update it at MPC
- * completion, if needed
- */
+ /* Only the sequence delta is relevant */
MPTCP_SKB_CB(skb)->map_seq = -skb->len;
MPTCP_SKB_CB(skb)->end_seq = 0;
MPTCP_SKB_CB(skb)->offset = 0;
MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+ MPTCP_SKB_CB(skb)->cant_coalesce = 1;
mptcp_data_lock(sk);
+ DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk));
- mptcp_set_owner_r(skb, sk);
+ mptcp_borrow_fwdmem(sk, skb);
+ skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
+ mptcp_sk(sk)->bytes_received += skb->len;
sk->sk_data_ready(sk);
mptcp_data_unlock(sk);
}
-
-void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
- const struct mptcp_options_received *mp_opt)
-{
- struct sock *sk = (struct sock *)msk;
- struct sk_buff *skb;
-
- mptcp_data_lock(sk);
- skb = skb_peek_tail(&sk->sk_receive_queue);
- if (skb) {
- WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
- pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk,
- MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
- MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
- MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
- MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
- }
-
- pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq);
- mptcp_data_unlock(sk);
-}
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 0dac2863c6e1..f23fda0c55a7 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -15,15 +15,27 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP),
+ SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED),
+ SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT),
SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+ SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX),
SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX),
SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED),
+ SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX),
+ SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR),
+ SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR),
+ SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK),
+ SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET),
SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
@@ -34,7 +46,11 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
+ SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX),
+ SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP),
SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
+ SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX),
+ SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP),
SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD),
SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP),
SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX),
@@ -44,6 +60,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX),
SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP),
+ SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX),
+ SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP),
SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
@@ -53,14 +71,20 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX),
SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX),
SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX),
- SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED),
SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB),
+ SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE),
+ SNMP_MIB_ITEM("MPCapableDataFallback", MPTCP_MIB_MPCAPABLEDATAFALLBACK),
+ SNMP_MIB_ITEM("MD5SigFallback", MPTCP_MIB_MD5SIGFALLBACK),
+ SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
+ SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
+ SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
+ SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE),
};
/* mptcp_mib_alloc - allocate percpu mib counters
@@ -83,22 +107,23 @@ bool mptcp_mib_alloc(struct net *net)
void mptcp_seq_show(struct seq_file *seq)
{
- unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1];
+ unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)];
+ const int cnt = ARRAY_SIZE(mptcp_snmp_list);
struct net *net = seq->private;
int i;
seq_puts(seq, "MPTcpExt:");
- for (i = 0; mptcp_snmp_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", mptcp_snmp_list[i].name);
seq_puts(seq, "\nMPTcpExt:");
memset(sum, 0, sizeof(sum));
if (net->mib.mptcp_statistics)
- snmp_get_cpu_field_batch(sum, mptcp_snmp_list,
- net->mib.mptcp_statistics);
+ snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt,
+ net->mib.mptcp_statistics);
- for (i = 0; mptcp_snmp_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %lu", sum[i]);
seq_putc(seq, '\n');
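
Since the counters are emitted as a name line followed by a value line, both prefixed with "MPTcpExt:", a userspace consumer pairs them positionally. A small illustrative parser (not part of this diff; the path is the usual /proc/net/netstat and the buffer sizes are arbitrary):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[4096], values[4096], *n, *v, *sn, *sv;
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;

	while (fgets(names, sizeof(names), f)) {
		if (strncmp(names, "MPTcpExt:", 9) ||
		    !fgets(values, sizeof(values), f))
			continue;

		/* Walk both lines in lockstep: i-th name matches i-th value */
		n = strtok_r(names + 9, " \n", &sn);
		v = strtok_r(values + 9, " \n", &sv);
		for (; n && v; n = strtok_r(NULL, " \n", &sn),
			       v = strtok_r(NULL, " \n", &sv))
			printf("%s = %s\n", n, v);
	}

	fclose(f);
	return 0;
}
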
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 2be3596374f4..812218b5ed2b 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <net/inet_common.h>
+
enum linux_mptcp_mib_field {
MPTCP_MIB_NUM = 0,
MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
@@ -8,15 +10,27 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_MPCAPABLEACTIVEDROP, /* Client-side fallback due to a MPC drop */
+ MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */
+ MPTCP_MIB_MPCAPABLEENDPATTEMPT, /* Prohibited MPC to port-based endp */
MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */
MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNBACKUPRX, /* Received a SYN + MP_JOIN + backup flag */
MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKBACKUPRX, /* Received a SYN/ACK + MP_JOIN + backup flag */
MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_JOINREJECTED, /* The PM rejected the JOIN request */
+ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNTXCONNECTERR, /* Not able to connect() when sending a SYN + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_DSSCORRUPTIONFALLBACK,/* DSS corruption detected, fallback */
+ MPTCP_MIB_DSSCORRUPTIONRESET, /* DSS corruption detected, MPJ subflow reset */
MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */
@@ -27,7 +41,15 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */
MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
+ MPTCP_MIB_ADDADDRTX, /* Sent ADD_ADDR with echo-flag=0 */
+ MPTCP_MIB_ADDADDRTXDROP, /* ADD_ADDR with echo-flag=0 not sent due to
+ * resource exhaustion
+ */
MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_ECHOADDTX, /* Sent ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_ECHOADDTXDROP, /* ADD_ADDR with echo-flag=1 not sent due
+ * to resource exhaustion
+ */
MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */
MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */
MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */
@@ -37,6 +59,8 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */
MPTCP_MIB_RMADDR, /* Received RM_ADDR */
MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */
+ MPTCP_MIB_RMADDRTX, /* Sent RM_ADDR */
+ MPTCP_MIB_RMADDRTXDROP, /* RM_ADDR not sent due to resource exhaustion */
MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
@@ -46,7 +70,6 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */
MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */
MPTCP_MIB_MPRSTRX, /* Received a MP_RST */
- MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */
MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */
MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */
@@ -55,6 +78,16 @@ enum linux_mptcp_mib_field {
* conflict with another subflow while updating msk rcv wnd
*/
MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */
+ MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */
+ MPTCP_MIB_BLACKHOLE, /* A blackhole has been detected */
+ MPTCP_MIB_MPCAPABLEDATAFALLBACK, /* Missing DSS/MPC+data on first
+ * established packet
+ */
+ MPTCP_MIB_MD5SIGFALLBACK, /* Conflicting TCP option enabled */
+ MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */
+ MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */
+ MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */
+ MPTCP_MIB_WINPROBE, /* MPTCP-level zero window probe */
__MPTCP_MIB_MAX
};
@@ -63,6 +96,14 @@ struct mptcp_mib {
unsigned long mibs[LINUX_MIB_MPTCP_MAX];
};
+static inline void MPTCP_ADD_STATS(struct net *net,
+ enum linux_mptcp_mib_field field,
+ int val)
+{
+ if (likely(net->mib.mptcp_statistics))
+ SNMP_ADD_STATS(net->mib.mptcp_statistics, field, val);
+}
+
static inline void MPTCP_INC_STATS(struct net *net,
enum linux_mptcp_mib_field field)
{
@@ -77,4 +118,11 @@ static inline void __MPTCP_INC_STATS(struct net *net,
__SNMP_INC_STATS(net->mib.mptcp_statistics, field);
}
+static inline void MPTCP_DEC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ SNMP_DEC_STATS(net->mib.mptcp_statistics, field);
+}
+
bool mptcp_mib_alloc(struct net *net);
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 8df1bdb647e2..136c2d05c0ee 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -10,15 +10,14 @@
#include <linux/net.h>
#include <linux/inet_diag.h>
#include <net/netlink.h>
-#include <uapi/linux/mptcp.h>
#include "protocol.h"
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
@@ -77,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
const struct inet_diag_req_v2 *r,
bool net_admin)
{
- struct inet_diag_dump_data *cb_data = cb->data;
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
- struct nlattr *bc = cb_data->inet_diag_nla_bc;
struct net *net = sock_net(skb->sk);
struct inet_hashinfo *hinfo;
int i;
@@ -122,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
if (!refcount_inc_not_zero(&sk->sk_refcnt))
goto next_listen;
- ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+ ret = sk_diag_dump(sk, skb, cb, r, net_admin);
sock_put(sk);
@@ -155,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
struct mptcp_sock *msk;
- struct nlattr *bc;
BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
-
while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
&diag_ctx->s_num)) != NULL) {
struct inet_sock *inet = (struct inet_sock *)msk;
@@ -182,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
r->id.idiag_dport)
goto next;
- ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+ ret = sk_diag_dump(sk, skb, cb, r, net_admin);
next:
sock_put(sk);
if (ret < 0) {
@@ -203,7 +195,8 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_info *info = _info;
- r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_rqueue = sk_rmem_alloc_get(sk) +
+ READ_ONCE(mptcp_sk(sk)->backlog_len);
r->idiag_wqueue = sk_wmem_alloc_get(sk);
if (inet_sk_state_load(sk) == TCP_LISTEN) {
@@ -225,6 +218,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static const struct inet_diag_handler mptcp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = mptcp_diag_dump,
.dump_one = mptcp_diag_dump_one,
.idiag_get_info = mptcp_diag_get_info,
@@ -245,4 +239,5 @@ static void __exit mptcp_diag_exit(void)
module_init(mptcp_diag_init);
module_exit(mptcp_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);
diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c
new file mode 100644
index 000000000000..c180930a8e0e
--- /dev/null
+++ b/net/mptcp/mptcp_pm_gen.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/mptcp_pm.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "mptcp_pm_gen.h"
+
+#include <uapi/linux/mptcp_pm.h>
+
+/* Common nested types */
+const struct nla_policy mptcp_pm_address_nl_policy[MPTCP_PM_ADDR_ATTR_IF_IDX + 1] = {
+ [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
+ [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_BE32, },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] = NLA_POLICY_EXACT_LEN(16),
+ [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16, },
+ [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32, },
+ [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32, },
+};
+
+/* MPTCP_PM_CMD_ADD_ADDR - do */
+const struct nla_policy mptcp_pm_add_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = {
+ [MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_DEL_ADDR - do */
+const struct nla_policy mptcp_pm_del_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = {
+ [MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_GET_ADDR - do */
+const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ATTR_TOKEN + 1] = {
+ [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+};
+
+/* MPTCP_PM_CMD_FLUSH_ADDRS - do */
+const struct nla_policy mptcp_pm_flush_addrs_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = {
+ [MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_SET_LIMITS - do */
+const struct nla_policy mptcp_pm_set_limits_nl_policy[MPTCP_PM_ATTR_SUBFLOWS + 1] = {
+ [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+};
+
+/* MPTCP_PM_CMD_GET_LIMITS - do */
+const struct nla_policy mptcp_pm_get_limits_nl_policy[MPTCP_PM_ATTR_SUBFLOWS + 1] = {
+ [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+};
+
+/* MPTCP_PM_CMD_SET_FLAGS - do */
+const struct nla_policy mptcp_pm_set_flags_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = {
+ [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_ANNOUNCE - do */
+const struct nla_policy mptcp_pm_announce_nl_policy[MPTCP_PM_ATTR_TOKEN + 1] = {
+ [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+};
+
+/* MPTCP_PM_CMD_REMOVE - do */
+const struct nla_policy mptcp_pm_remove_nl_policy[MPTCP_PM_ATTR_LOC_ID + 1] = {
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, },
+};
+
+/* MPTCP_PM_CMD_SUBFLOW_CREATE - do */
+const struct nla_policy mptcp_pm_subflow_create_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = {
+ [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* MPTCP_PM_CMD_SUBFLOW_DESTROY - do */
+const struct nla_policy mptcp_pm_subflow_destroy_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = {
+ [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy),
+};
+
+/* Ops table for mptcp_pm */
+const struct genl_ops mptcp_pm_nl_ops[11] = {
+ {
+ .cmd = MPTCP_PM_CMD_ADD_ADDR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_add_addr_doit,
+ .policy = mptcp_pm_add_addr_nl_policy,
+ .maxattr = MPTCP_PM_ENDPOINT_ADDR,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_DEL_ADDR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_del_addr_doit,
+ .policy = mptcp_pm_del_addr_nl_policy,
+ .maxattr = MPTCP_PM_ENDPOINT_ADDR,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_ADDR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_get_addr_doit,
+ .dumpit = mptcp_pm_nl_get_addr_dumpit,
+ .policy = mptcp_pm_get_addr_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_TOKEN,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_flush_addrs_doit,
+ .policy = mptcp_pm_flush_addrs_nl_policy,
+ .maxattr = MPTCP_PM_ENDPOINT_ADDR,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_LIMITS,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_set_limits_doit,
+ .policy = mptcp_pm_set_limits_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_SUBFLOWS,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_LIMITS,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_get_limits_doit,
+ .policy = mptcp_pm_get_limits_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_SUBFLOWS,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_FLAGS,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_set_flags_doit,
+ .policy = mptcp_pm_set_flags_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_ADDR_REMOTE,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_ANNOUNCE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_announce_doit,
+ .policy = mptcp_pm_announce_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_TOKEN,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_REMOVE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_remove_doit,
+ .policy = mptcp_pm_remove_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_LOC_ID,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_subflow_create_doit,
+ .policy = mptcp_pm_subflow_create_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_ADDR_REMOTE,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .doit = mptcp_pm_nl_subflow_destroy_doit,
+ .policy = mptcp_pm_subflow_destroy_nl_policy,
+ .maxattr = MPTCP_PM_ATTR_ADDR_REMOTE,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+};
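
The generated policies are consumed by the doit/dumpit handlers declared in mptcp_pm_gen.h. A hypothetical sketch of how a handler could validate and extract the nested address attribute against mptcp_pm_address_nl_policy (the helper name and its minimal error handling are invented for illustration, not taken from this diff):

static int parse_addr_id(struct genl_info *info, u8 *id)
{
	struct nlattr *tb[MPTCP_PM_ADDR_ATTR_IF_IDX + 1];
	struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
	int err;

	if (!attr)
		return -EINVAL;

	/* Validate the nested attributes against the generated policy */
	err = nla_parse_nested(tb, MPTCP_PM_ADDR_ATTR_IF_IDX, attr,
			       mptcp_pm_address_nl_policy, info->extack);
	if (err)
		return err;

	if (!tb[MPTCP_PM_ADDR_ATTR_ID])
		return -EINVAL;

	*id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
	return 0;
}
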
diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h
new file mode 100644
index 000000000000..b9280bcb43f5
--- /dev/null
+++ b/net/mptcp/mptcp_pm_gen.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/mptcp_pm.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_MPTCP_PM_GEN_H
+#define _LINUX_MPTCP_PM_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/mptcp_pm.h>
+
+/* Common nested types */
+extern const struct nla_policy mptcp_pm_address_nl_policy[MPTCP_PM_ADDR_ATTR_IF_IDX + 1];
+
+extern const struct nla_policy mptcp_pm_add_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1];
+
+extern const struct nla_policy mptcp_pm_del_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1];
+
+extern const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ATTR_TOKEN + 1];
+
+extern const struct nla_policy mptcp_pm_flush_addrs_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1];
+
+extern const struct nla_policy mptcp_pm_set_limits_nl_policy[MPTCP_PM_ATTR_SUBFLOWS + 1];
+
+extern const struct nla_policy mptcp_pm_get_limits_nl_policy[MPTCP_PM_ATTR_SUBFLOWS + 1];
+
+extern const struct nla_policy mptcp_pm_set_flags_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1];
+
+extern const struct nla_policy mptcp_pm_announce_nl_policy[MPTCP_PM_ATTR_TOKEN + 1];
+
+extern const struct nla_policy mptcp_pm_remove_nl_policy[MPTCP_PM_ATTR_LOC_ID + 1];
+
+extern const struct nla_policy mptcp_pm_subflow_create_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1];
+
+extern const struct nla_policy mptcp_pm_subflow_destroy_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1];
+
+/* Ops table for mptcp_pm */
+extern const struct genl_ops mptcp_pm_nl_ops[11];
+
+int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info);
+int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb,
+ struct genl_info *info);
+
+#endif /* _LINUX_MPTCP_PM_GEN_H */
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 5ded85e2c374..f24ae7d40e88 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -116,51 +116,46 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
ptr += 2;
}
- pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u",
+ pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n",
version, flags, opsize, mp_opt->sndr_key,
mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum);
break;
case MPTCPOPT_MP_JOIN:
- mp_opt->suboptions |= OPTIONS_MPTCP_MPJ;
if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
+ mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN;
mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
mp_opt->join_id = *ptr++;
mp_opt->token = get_unaligned_be32(ptr);
ptr += 4;
mp_opt->nonce = get_unaligned_be32(ptr);
ptr += 4;
- pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+ pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n",
mp_opt->backup, mp_opt->join_id,
mp_opt->token, mp_opt->nonce);
} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
+ mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK;
mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
mp_opt->join_id = *ptr++;
mp_opt->thmac = get_unaligned_be64(ptr);
ptr += 8;
mp_opt->nonce = get_unaligned_be32(ptr);
ptr += 4;
- pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
mp_opt->backup, mp_opt->join_id,
mp_opt->thmac, mp_opt->nonce);
} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
+ mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK;
ptr += 2;
memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
- pr_debug("MP_JOIN hmac");
- } else {
- mp_opt->suboptions &= ~OPTIONS_MPTCP_MPJ;
+ pr_debug("MP_JOIN hmac\n");
}
break;
case MPTCPOPT_DSS:
- pr_debug("DSS");
+ pr_debug("DSS\n");
ptr++;
- /* we must clear 'mpc_map' be able to detect MP_CAPABLE
- * map vs DSS map in mptcp_incoming_options(), and reconstruct
- * map info accordingly
- */
- mp_opt->mpc_map = 0;
flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
@@ -168,7 +163,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
- pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
+ pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n",
mp_opt->data_fin, mp_opt->dsn64,
mp_opt->use_map, mp_opt->ack64,
mp_opt->use_ack);
@@ -206,7 +201,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 4;
}
- pr_debug("data_ack=%llu", mp_opt->data_ack);
+ pr_debug("data_ack=%llu\n", mp_opt->data_ack);
}
if (mp_opt->use_map) {
@@ -230,7 +225,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 2;
}
- pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
mp_opt->data_seq, mp_opt->subflow_seq,
mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD),
mp_opt->csum);
@@ -292,7 +287,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->ahmac = get_unaligned_be64(ptr);
ptr += 8;
}
- pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
+ pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n",
(mp_opt->addr.family == AF_INET6) ? "6" : "",
mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port));
break;
@@ -308,7 +303,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
for (i = 0; i < mp_opt->rm_list.nr; i++)
mp_opt->rm_list.ids[i] = *ptr++;
- pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
+ pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr);
break;
case MPTCPOPT_MP_PRIO:
@@ -317,7 +312,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->suboptions |= OPTION_MPTCP_PRIO;
mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
- pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
+ pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup);
break;
case MPTCPOPT_MP_FASTCLOSE:
@@ -328,7 +323,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->rcvr_key = get_unaligned_be64(ptr);
ptr += 8;
mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE;
- pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key);
+ pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key);
break;
case MPTCPOPT_RST:
@@ -342,7 +337,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
flags = *ptr++;
mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
mp_opt->reset_reason = *ptr;
- pr_debug("MP_RST: transient=%u reason=%u",
+ pr_debug("MP_RST: transient=%u reason=%u\n",
mp_opt->reset_transient, mp_opt->reset_reason);
break;
@@ -353,7 +348,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 2;
mp_opt->suboptions |= OPTION_MPTCP_FAIL;
mp_opt->fail_seq = get_unaligned_be64(ptr);
- pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq);
+ pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq);
break;
default:
@@ -368,8 +363,11 @@ void mptcp_get_options(const struct sk_buff *skb,
const unsigned char *ptr;
int length;
- /* initialize option status */
- mp_opt->suboptions = 0;
+ /* Ensure that casting the whole status to u32 is efficient and safe */
+ BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status),
+ sizeof(u32)));
+ *(u32 *)&mp_opt->status = 0;
length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
@@ -416,7 +414,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
} else if (subflow->request_join) {
- pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+ pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token,
subflow->local_nonce);
opts->suboptions = OPTION_MPTCP_MPJ_SYN;
opts->join_id = subflow->local_id;
@@ -434,7 +432,6 @@ static void clear_3rdack_retransmission(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
- icsk->icsk_ack.timeout = 0;
icsk->icsk_ack.ato = 0;
icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
}
@@ -442,7 +439,6 @@ static void clear_3rdack_retransmission(struct sock *sk)
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
bool snd_data_fin_enable,
unsigned int *size,
- unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -461,7 +457,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
return false;
/* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */
- if (subflow->fully_established || snd_data_fin_enable ||
+ if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable ||
subflow->snd_isn != TCP_SKB_CB(skb)->seq ||
sk->sk_state != TCP_ESTABLISHED)
return false;
@@ -500,7 +496,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
*size = TCPOLEN_MPTCP_MPC_ACK;
}
- pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+ pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n",
subflow, subflow->local_key, subflow->remote_key,
data_len);
@@ -509,7 +505,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
opts->suboptions = OPTION_MPTCP_MPJ_ACK;
memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
*size = TCPOLEN_MPTCP_MPJ_ACK;
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
/* we can use the full delegate action helper only from BH context
* If we are in process context - sk is flushing the backlog at
@@ -556,7 +552,6 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
bool snd_data_fin_enable,
unsigned int *size,
- unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -580,7 +575,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
opts->ext_copy = *mpext;
}
- remaining -= map_size;
dss_size = map_size;
if (skb && snd_data_fin_enable)
mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
@@ -609,7 +603,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
}
opts->ext_copy.use_ack = 1;
opts->suboptions = OPTION_MPTCP_DSS;
- WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
@@ -657,6 +650,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
bool drop_other_suboptions = false;
unsigned int opt_size = *size;
+ struct mptcp_addr_info addr;
bool echo;
int len;
@@ -665,19 +659,26 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
*/
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
- !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
+ !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr,
&echo, &drop_other_suboptions))
return false;
+ /*
+ * Later on, mptcp_write_options() will enforce mutual exclusion with
+ * DSS, bail out if such option is set and we can't drop it.
+ */
if (drop_other_suboptions)
remaining += opt_size;
- len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
+ else if (opts->suboptions & OPTION_MPTCP_DSS)
+ return false;
+
+ len = mptcp_add_addr_len(addr.family, echo, !!addr.port);
if (remaining < len)
return false;
*size = len;
if (drop_other_suboptions) {
- pr_debug("drop other suboptions");
+ pr_debug("drop other suboptions\n");
opts->suboptions = 0;
/* note that e.g. DSS could have written into the memory
@@ -688,13 +689,17 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
opts->ahmac = 0;
*size -= opt_size;
}
+ opts->addr = addr;
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
if (!echo) {
- opts->ahmac = add_addr_generate_hmac(msk->local_key,
- msk->remote_key,
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
+ opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key),
+ READ_ONCE(msk->remote_key),
&opts->addr);
+ } else {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
}
- pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
+ pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n",
opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));
return true;
@@ -725,8 +730,8 @@ static bool mptcp_established_options_rm_addr(struct sock *sk,
opts->rm_list = rm_list;
for (i = 0; i < opts->rm_list.nr; i++)
- pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
-
+ pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]);
+ MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr);
return true;
}
@@ -751,7 +756,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
opts->suboptions |= OPTION_MPTCP_PRIO;
opts->backup = subflow->request_bkup;
- pr_debug("prio=%d", opts->backup);
+ pr_debug("prio=%d\n", opts->backup);
return true;
}
@@ -791,9 +796,9 @@ static bool mptcp_established_options_fastclose(struct sock *sk,
*size = TCPOLEN_MPTCP_FASTCLOSE;
opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
- opts->rcvr_key = msk->remote_key;
+ opts->rcvr_key = READ_ONCE(msk->remote_key);
- pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+ pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
return true;
}
@@ -815,7 +820,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk,
opts->suboptions |= OPTION_MPTCP_FAIL;
opts->fail_seq = subflow->map_seq;
- pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+ pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
return true;
@@ -833,8 +838,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0;
+ /* Force later mptcp_write_options(), but do not use any actual
+ * option space.
+ */
if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb)))
- return false;
+ return true;
if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) ||
@@ -851,9 +859,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
}
snd_data_fin = mptcp_data_fin_enabled(msk);
- if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
+ if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts))
ret = true;
- else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) {
+ else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) {
unsigned int mp_fail_size;
ret = true;
@@ -903,16 +911,16 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
opts->csum_reqd = subflow_req->csum_reqd;
opts->allow_join_id0 = subflow_req->allow_join_id0;
*size = TCPOLEN_MPTCP_MPC_SYNACK;
- pr_debug("subflow_req=%p, local_key=%llu",
+ pr_debug("subflow_req=%p, local_key=%llu\n",
subflow_req, subflow_req->local_key);
return true;
} else if (subflow_req->mp_join) {
opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
- opts->backup = subflow_req->backup;
+ opts->backup = subflow_req->request_bkup;
opts->join_id = subflow_req->local_id;
opts->thmac = subflow_req->thmac;
opts->nonce = subflow_req->local_nonce;
- pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
subflow_req, opts->backup, opts->join_id,
opts->thmac, opts->nonce);
*size = TCPOLEN_MPTCP_MPJ_SYNACK;
@@ -929,7 +937,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
/* here we can process OoO, in-window pkts, only in-sequence 4th ack
* will make the subflow fully established
*/
- if (likely(subflow->fully_established)) {
+ if (likely(READ_ONCE(subflow->fully_established))) {
/* on passive sockets, check for 3rd ack retransmission
* note that msk is always set by subflow_syn_recv_sock()
* for mp_join subflows
@@ -957,13 +965,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (subflow->remote_key_valid &&
(((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
- ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) {
+ ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) &&
+ (!mp_opt->echo || subflow->mp_join)))) {
/* subflows are fully established as soon as we get any
* additional ack, including ADD_ADDR.
*/
- subflow->fully_established = 1;
- WRITE_ONCE(msk->fully_established, true);
- goto check_notify;
+ goto set_fully_established;
}
/* If the first established packet does not contain MP_CAPABLE + data
@@ -974,18 +981,23 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (subflow->mp_join)
goto reset;
subflow->mp_capable = 0;
- pr_fallback(msk);
- mptcp_do_fallback(ssk);
+ if (!mptcp_try_fallback(ssk, MPTCP_MIB_MPCAPABLEDATAFALLBACK)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED);
+ goto reset;
+ }
return false;
}
+ if (unlikely(!READ_ONCE(msk->pm.server_side)))
+ pr_warn_once("bogus mpc option on established client sk");
+
+set_fully_established:
if (mp_opt->deny_join_id0)
WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
-set_fully_established:
- if (unlikely(!READ_ONCE(msk->pm.server_side)))
- pr_warn_once("bogus mpc option on established client sk");
- mptcp_subflow_fully_established(subflow, mp_opt);
+ mptcp_data_lock((struct sock *)msk);
+ __mptcp_subflow_fully_established(msk, subflow, mp_opt);
+ mptcp_data_unlock((struct sock *)msk);
check_notify:
/* if the subflow is not already linked into the conn_list, we can't
@@ -1001,7 +1013,7 @@ check_notify:
clear_3rdack_retransmission(ssk);
mptcp_pm_subflow_established(msk);
} else {
- mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC);
+ mptcp_pm_fully_established(msk, ssk);
}
return true;
@@ -1026,6 +1038,37 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
return cur_seq;
}
+static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
+{
+ msk->bytes_acked += new_snd_una - msk->snd_una;
+ WRITE_ONCE(msk->snd_una, new_snd_una);
+}
+
+static void rwin_update(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct tcp_sock *tp = tcp_sk(ssk);
+ u64 mptcp_rcv_wnd;
+
+ /* Avoid touching extra cachelines if TCP is going to accept this
+ * skb without filling the TCP-level window even with a possibly
+ * outdated mptcp-level rwin.
+ */
+ if (!skb->len || skb->len < tcp_receive_window(tp))
+ return;
+
+ mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent);
+ if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent))
+ return;
+
+ /* Some other subflow grew the mptcp-level rwin since rcv_wup,
+ * resync.
+ */
+ tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent;
+ subflow->rcv_wnd_sent = mptcp_rcv_wnd;
+}
+
static void ack_update_msk(struct mptcp_sock *msk,
struct sock *ssk,
struct mptcp_options_received *mp_opt)
@@ -1050,21 +1093,22 @@ static void ack_update_msk(struct mptcp_sock *msk,
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
if (after64(new_wnd_end, msk->wnd_end))
- msk->wnd_end = new_wnd_end;
+ WRITE_ONCE(msk->wnd_end, new_wnd_end);
/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
- if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
+ if (after64(msk->wnd_end, snd_nxt))
__mptcp_check_push(sk, ssk);
if (after64(new_snd_una, old_snd_una)) {
- msk->snd_una = new_snd_una;
+ __mptcp_snd_una_update(msk, new_snd_una);
__mptcp_data_acked(sk);
}
+ msk->last_ack_recv = tcp_jiffies32;
mptcp_data_unlock(sk);
trace_ack_update_msk(mp_opt->data_ack,
old_snd_una, new_snd_una,
- new_wnd_end, msk->wnd_end);
+ new_wnd_end, READ_ONCE(msk->wnd_end));
}
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
@@ -1092,8 +1136,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
if (mp_opt->echo)
return true;
- hmac = add_addr_generate_hmac(msk->remote_key,
- msk->local_key,
+ hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key),
+ READ_ONCE(msk->local_key),
&mp_opt->addr);
pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
@@ -1102,7 +1146,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
return hmac == mp_opt->ahmac;
}
-/* Return false if a subflow has been reset, else return true */
+/* Return false in case of error (or subflow has been reset),
+ * else return true.
+ */
bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -1119,6 +1165,12 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mptcp_data_lock(subflow->conn);
if (sk_stream_memory_free(sk))
__mptcp_check_push(subflow->conn, sk);
+
+ /* on fallback we just need to ignore the msk-level snd_una, as
+ * this is really plain TCP
+ */
+ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
+
__mptcp_data_acked(subflow->conn);
mptcp_data_unlock(subflow->conn);
return true;
@@ -1134,7 +1186,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) {
if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) &&
- msk->local_key == mp_opt.rcvr_key) {
+ READ_ONCE(msk->local_key) == mp_opt.rcvr_key) {
WRITE_ONCE(msk->rcv_fastclose, true);
mptcp_schedule_work((struct sock *)msk);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX);
@@ -1184,6 +1236,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
*/
if (mp_opt.use_ack)
ack_update_msk(msk, sk, &mp_opt);
+ rwin_update(msk, sk, skb);
/* Zero-data-length packets are dropped by the caller and not
* propagated to the MPTCP layer, so the skb extension does not
@@ -1192,16 +1245,15 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
if (mp_opt.data_fin && mp_opt.data_len == 1 &&
- mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) &&
- schedule_work(&msk->work))
- sock_hold(subflow->conn);
+ mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64))
+ mptcp_schedule_work((struct sock *)msk);
return true;
}
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
- return true;
+ return false;
memset(mpext, 0, sizeof(*mpext));
@@ -1258,18 +1310,23 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
if (rcv_wnd == rcv_wnd_old)
break;
- if (before64(rcv_wnd_new, rcv_wnd)) {
+
+ rcv_wnd_old = rcv_wnd;
+ if (before64(rcv_wnd_new, rcv_wnd_old)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
goto raise_win;
}
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
- rcv_wnd_old = rcv_wnd;
}
- return;
+ goto update_wspace;
}
if (rcv_wnd_new != rcv_wnd_old) {
raise_win:
+ /* The msk-level rcv wnd is after the tcp level one,
+ * sync the latter.
+ */
+ rcv_wnd_new = rcv_wnd_old;
win = rcv_wnd_old - ack_seq;
tp->rcv_wnd = min_t(u64, win, U32_MAX);
new_win = tp->rcv_wnd;
@@ -1290,6 +1347,24 @@ raise_win:
th->window = htons(new_win);
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
}
+
+update_wspace:
+ WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
+ subflow->rcv_wnd_sent = rcv_wnd_new;
+}
+
+static void mptcp_track_rwin(struct tcp_sock *tp)
+{
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk;
+
+ if (!ssk)
+ return;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ msk = mptcp_sk(subflow->conn);
+ WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
}
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
@@ -1584,6 +1659,10 @@ mp_rst:
opts->reset_transient,
opts->reset_reason);
return;
+ } else if (unlikely(!opts->suboptions)) {
+ /* Fallback to TCP */
+ mptcp_track_rwin(tp);
+ return;
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
@@ -1594,8 +1673,7 @@ mp_rst:
TCPOLEN_MPTCP_PRIO,
opts->backup, TCPOPT_NOP);
- MPTCP_INC_STATS(sock_net((const struct sock *)tp),
- MPTCP_MIB_MPPRIOTX);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX);
}
mp_capable_done:
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 70f0ced3ca86..e2040c327af6 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -5,13 +5,426 @@
*/
#define pr_fmt(fmt) "MPTCP: " fmt
-#include <linux/kernel.h>
-#include <net/tcp.h>
-#include <net/mptcp.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
#include "protocol.h"
-
#include "mib.h"
+#define ADD_ADDR_RETRANS_MAX 3
+
+struct mptcp_pm_add_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ u8 retrans_times;
+ struct timer_list add_timer;
+ struct mptcp_sock *sock;
+ struct rcu_head rcu;
+};
+
+static DEFINE_SPINLOCK(mptcp_pm_list_lock);
+static LIST_HEAD(mptcp_pm_list);
+
+/* path manager helpers */
+
+/* if sk is ipv4 or ipv6_only, allow only same-family local and remote addresses;
+ * otherwise allow any matching local/remote pair
+ */
+bool mptcp_pm_addr_families_match(const struct sock *sk,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *rem)
+{
+ bool mptcp_is_v4 = sk->sk_family == AF_INET;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
+ bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);
+
+ if (mptcp_is_v4)
+ return loc_is_v4 && rem_is_v4;
+
+ if (ipv6_only_sock(sk))
+ return !loc_is_v4 && !rem_is_v4;
+
+ return loc_is_v4 == rem_is_v4;
+#else
+ return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
+#endif
+}
+
+bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
+ const struct mptcp_addr_info *b, bool use_port)
+{
+ bool addr_equals = false;
+
+ if (a->family == b->family) {
+ if (a->family == AF_INET)
+ addr_equals = a->addr.s_addr == b->addr.s_addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else
+ addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6);
+ } else if (a->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&b->addr6))
+ addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
+ } else if (b->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&a->addr6))
+ addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
+#endif
+ }
+
+ if (!addr_equals)
+ return false;
+ if (!use_port)
+ return true;
+
+ return a->port == b->port;
+}
+
+void mptcp_local_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->family = skc->skc_family;
+ addr->port = htons(skc->skc_num);
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_rcv_saddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_rcv_saddr;
+#endif
+}
+
+void mptcp_remote_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->family = skc->skc_family;
+ addr->port = skc->skc_dport;
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_daddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_daddr;
+#endif
+}
+
+static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_addr_info mpc_remote;
+
+ mptcp_remote_address((struct sock_common *)msk, &mpc_remote);
+ return mptcp_addresses_equal(&mpc_remote, remote, remote->port);
+}
+
+bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info cur;
+ struct sock_common *skc;
+
+ list_for_each_entry(subflow, list, node) {
+ skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_local_address(skc, &cur);
+ if (mptcp_addresses_equal(&cur, saddr, saddr->port))
+ return true;
+ }
+
+ return false;
+}
+
+static struct mptcp_pm_add_entry *
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ lockdep_assert_held(&msk->pm.lock);
+
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, addr, true))
+ return entry;
+ }
+
+ return NULL;
+}
+
+bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+ bool ret;
+
+ entry = mptcp_pm_del_add_timer(msk, addr, false);
+ ret = entry;
+ kfree_rcu(entry, rcu);
+
+ return ret;
+}
+
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct mptcp_addr_info saddr;
+ bool ret = false;
+
+ mptcp_local_address((struct sock_common *)sk, &saddr);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
+ ret = true;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+static void __mptcp_pm_send_ack(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ bool prio, bool backup)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ pr_debug("send ack for %s\n",
+ prio ? "mp_prio" :
+ (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
+
+ slow = lock_sock_fast(ssk);
+ if (prio) {
+ subflow->send_mp_prio = 1;
+ subflow->request_bkup = backup;
+ }
+
+ __mptcp_subflow_send_ack(ssk);
+ unlock_sock_fast(ssk, slow);
+}
+
+void mptcp_pm_send_ack(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ bool prio, bool backup)
+{
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_pm_send_ack(msk, subflow, prio, backup);
+ spin_lock_bh(&msk->pm.lock);
+}
+
+void mptcp_pm_addr_send_ack(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *alt = NULL;
+
+ msk_owned_by_me(msk);
+ lockdep_assert_held(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal(msk) &&
+ !mptcp_pm_should_rm_signal(msk))
+ return;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (__mptcp_subflow_active(subflow)) {
+ if (!subflow->stale) {
+ mptcp_pm_send_ack(msk, subflow, false, false);
+ return;
+ }
+
+ if (!alt)
+ alt = subflow;
+ }
+ }
+
+ if (alt)
+ mptcp_pm_send_ack(msk, alt, false, false);
+}
+
+int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup)
+{
+ struct mptcp_subflow_context *subflow;
+
+ pr_debug("bkup=%d\n", bkup);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct mptcp_addr_info local, remote;
+
+ mptcp_local_address((struct sock_common *)ssk, &local);
+ if (!mptcp_addresses_equal(&local, addr, addr->port))
+ continue;
+
+ if (rem && rem->family != AF_UNSPEC) {
+ mptcp_remote_address((struct sock_common *)ssk, &remote);
+ if (!mptcp_addresses_equal(&remote, rem, rem->port))
+ continue;
+ }
+
+ __mptcp_pm_send_ack(msk, subflow, true, bkup);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk)
+{
+ const struct net *net = sock_net((struct sock *)msk);
+ unsigned int rto = mptcp_get_add_addr_timeout(net);
+ struct mptcp_subflow_context *subflow;
+ unsigned int max = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct inet_connection_sock *icsk = inet_csk(ssk);
+
+ if (icsk->icsk_rto > max)
+ max = icsk->icsk_rto;
+ }
+
+ if (max && max < rto)
+ rto = max;
+
+ return rto;
+}
+
+static void mptcp_pm_add_timer(struct timer_list *timer)
+{
+ struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer,
+ add_timer);
+ struct mptcp_sock *msk = entry->sock;
+ struct sock *sk = (struct sock *)msk;
+ unsigned int timeout;
+
+ pr_debug("msk=%p\n", msk);
+
+ if (!msk)
+ return;
+
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
+
+ if (!entry->addr.id)
+ return;
+
+ if (mptcp_pm_should_add_signal_addr(msk)) {
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ goto out;
+ }
+
+ timeout = mptcp_adjust_add_addr_timeout(msk);
+ if (!timeout)
+ goto out;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal_addr(msk)) {
+ pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
+ mptcp_pm_announce_addr(msk, &entry->addr, false);
+ mptcp_pm_add_addr_send_ack(msk);
+ entry->retrans_times++;
+ }
+
+ if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
+ sk_reset_timer(sk, timer,
+ jiffies + (timeout << entry->retrans_times));
+
+ spin_unlock_bh(&msk->pm.lock);
+
+ if (entry->retrans_times == ADD_ADDR_RETRANS_MAX)
+ mptcp_pm_subflow_established(msk);
+
+out:
+ __sock_put(sk);
+}
+
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr, bool check_id)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct sock *sk = (struct sock *)msk;
+ bool stop_timer = false;
+
+ rcu_read_lock();
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
+ if (entry && (!check_id || entry->addr.id == addr->id)) {
+ entry->retrans_times = ADD_ADDR_RETRANS_MAX;
+ stop_timer = true;
+ }
+ if (!check_id && entry)
+ list_del(&entry->list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ /* Note: entry might have been removed by another thread.
+ * We hold rcu_read_lock() to ensure it is not freed under us.
+ */
+ if (stop_timer)
+ sk_stop_timer_sync(sk, &entry->add_timer);
+
+ rcu_read_unlock();
+ return entry;
+}
+
+bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *add_entry = NULL;
+ struct sock *sk = (struct sock *)msk;
+ unsigned int timeout;
+
+ lockdep_assert_held(&msk->pm.lock);
+
+ add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
+
+ if (add_entry) {
+ if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk)))
+ return false;
+
+ goto reset_timer;
+ }
+
+ add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
+ if (!add_entry)
+ return false;
+
+ list_add(&add_entry->list, &msk->pm.anno_list);
+
+ add_entry->addr = *addr;
+ add_entry->sock = msk;
+ add_entry->retrans_times = 0;
+
+ timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
+reset_timer:
+ timeout = mptcp_adjust_add_addr_timeout(msk);
+ if (timeout)
+ sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
+
+ return true;
+}
+
+static void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_add_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ LIST_HEAD(free_list);
+
+ pr_debug("msk=%p\n", msk);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_splice_init(&msk->pm.anno_list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+ sk_stop_timer_sync(sk, &entry->add_timer);
+ kfree_rcu(entry, rcu);
+ }
+}
+
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
@@ -20,13 +433,14 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
{
u8 add_addr = READ_ONCE(msk->pm.addr_signal);
- pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);
+ pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo);
lockdep_assert_held(&msk->pm.lock);
if (add_addr &
(echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
- pr_warn("addr_signal error, add_addr=%d, echo=%d", add_addr, echo);
+ MPTCP_INC_STATS(sock_net((struct sock *)msk),
+ echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP);
return -EINVAL;
}
@@ -45,27 +459,18 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_
{
u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
- pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
+ pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr);
if (rm_addr) {
- pr_warn("addr_signal error, rm_addr=%d", rm_addr);
+ MPTCP_ADD_STATS(sock_net((struct sock *)msk),
+ MPTCP_MIB_RMADDRTXDROP, rm_list->nr);
return -EINVAL;
}
msk->pm.rm_list_tx = *rm_list;
rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
WRITE_ONCE(msk->pm.addr_signal, rm_addr);
- mptcp_pm_nl_addr_send_ack(msk);
- return 0;
-}
-
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
-{
- pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
-
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_nl_rm_subflow_received(msk, rm_list);
- spin_unlock_bh(&msk->pm.lock);
+ mptcp_pm_addr_send_ack(msk);
return 0;
}
@@ -75,7 +480,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int
{
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
+ pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side);
WRITE_ONCE(pm->server_side, server_side);
mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
@@ -84,16 +489,24 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- unsigned int subflows_max;
+ unsigned int limit_extra_subflows;
int ret = 0;
- if (mptcp_pm_is_userspace(msk))
- return mptcp_userspace_pm_active(msk);
+ if (mptcp_pm_is_userspace(msk)) {
+ if (mptcp_userspace_pm_active(msk)) {
+ spin_lock_bh(&pm->lock);
+ pm->extra_subflows++;
+ spin_unlock_bh(&pm->lock);
+ return true;
+ }
+ return false;
+ }
- subflows_max = mptcp_pm_get_subflows_max(msk);
+ limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
- pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
- subflows_max, READ_ONCE(pm->accept_subflow));
+ pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk,
+ pm->extra_subflows, limit_extra_subflows,
+ READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->accept_subflow))
@@ -101,8 +514,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->accept_subflow)) {
- ret = pm->subflows < subflows_max;
- if (ret && ++pm->subflows == subflows_max)
+ ret = pm->extra_subflows < limit_extra_subflows;
+ if (ret && ++pm->extra_subflows == limit_extra_subflows)
WRITE_ONCE(pm->accept_subflow, false);
}
spin_unlock_bh(&pm->lock);
@@ -116,7 +529,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
enum mptcp_pm_status new_status)
{
- pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+ pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status,
BIT(new_status));
if (msk->pm.status & BIT(new_status))
return false;
@@ -126,12 +539,12 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return true;
}
-void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp)
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
{
struct mptcp_pm_data *pm = &msk->pm;
bool announce = false;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
spin_lock_bh(&pm->lock);
@@ -140,29 +553,32 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk,
* be sure to serve this event only once.
*/
if (READ_ONCE(pm->work_pending) &&
- !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
+ !(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
- if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
+ if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
announce = true;
- msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
+ pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
spin_unlock_bh(&pm->lock);
if (announce)
- mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp);
+ mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
+
+ if (msk->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
}
void mptcp_pm_subflow_established(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
if (!READ_ONCE(pm->work_pending))
return;
@@ -175,14 +591,23 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk)
spin_unlock_bh(&pm->lock);
}
-void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
const struct mptcp_subflow_context *subflow)
{
+ struct sock *sk = (struct sock *)msk;
struct mptcp_pm_data *pm = &msk->pm;
bool update_subflows;
- update_subflows = (subflow->request_join || subflow->mp_join) &&
- mptcp_pm_is_kernel(msk);
+ update_subflows = subflow->request_join || subflow->mp_join;
+ if (mptcp_pm_is_userspace(msk)) {
+ if (update_subflows) {
+ spin_lock_bh(&pm->lock);
+ pm->extra_subflows--;
+ spin_unlock_bh(&pm->lock);
+ }
+ return;
+ }
+
if (!READ_ONCE(pm->work_pending) && !update_subflows)
return;
@@ -193,7 +618,8 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
/* Even if this subflow is not really established, tell the PM to try
* to pick the next ones, if possible.
*/
- if (mptcp_pm_nl_check_work_pending(msk))
+ if (mptcp_is_fully_established(sk) &&
+ mptcp_pm_nl_check_work_pending(msk))
mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
spin_unlock_bh(&pm->lock);
@@ -206,7 +632,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+ pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id,
READ_ONCE(pm->accept_addr));
mptcp_event_addr_announced(ssk, addr);
@@ -220,7 +646,12 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
} else {
__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
}
- } else if (!READ_ONCE(pm->accept_addr)) {
+ /* - id0 should not have a different address
+ * - special case for C-flag: linked to fill_local_addresses_vec()
+ */
+ } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) ||
+ (addr->id > 0 && !READ_ONCE(pm->accept_addr) &&
+ !mptcp_pm_add_addr_c_flag_case(msk))) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
@@ -237,7 +668,10 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
{
struct mptcp_pm_data *pm = &msk->pm;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
+
+ if (!READ_ONCE(pm->work_pending))
+ return;
spin_lock_bh(&pm->lock);
@@ -255,13 +689,87 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
}
+static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list,
+ enum linux_mptcp_mib_field rm_type)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ u8 i;
+
+ pr_debug("%s rm_list_nr %d\n",
+ rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);
+
+ msk_owned_by_me(msk);
+
+ if (sk->sk_state == TCP_LISTEN)
+ return;
+
+ if (!rm_list->nr)
+ return;
+
+ if (list_empty(&msk->conn_list))
+ return;
+
+ for (i = 0; i < rm_list->nr; i++) {
+ u8 rm_id = rm_list->ids[i];
+ bool removed = false;
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ u8 remote_id = READ_ONCE(subflow->remote_id);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+ u8 id = subflow_get_local_id(subflow);
+
+ if ((1 << inet_sk_state_load(ssk)) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
+ continue;
+ if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id)
+ continue;
+ if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id)
+ continue;
+
+ pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n",
+ rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
+ i, rm_id, id, remote_id, msk->mpc_endpoint_id);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ removed |= subflow->request_join;
+
+ /* the following takes care of updating the subflows counter */
+ mptcp_close_ssk(sk, ssk, subflow);
+ spin_lock_bh(&msk->pm.lock);
+
+ if (rm_type == MPTCP_MIB_RMSUBFLOW)
+ __MPTCP_INC_STATS(sock_net(sk), rm_type);
+ }
+
+ if (rm_type == MPTCP_MIB_RMADDR) {
+ __MPTCP_INC_STATS(sock_net(sk), rm_type);
+ if (removed && mptcp_pm_is_kernel(msk))
+ mptcp_pm_nl_rm_addr(msk, rm_id);
+ }
+ }
+}
+
+static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk)
+{
+ mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
+}
+
+void mptcp_pm_rm_subflow(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list)
+{
+ mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
+}
+
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list)
{
struct mptcp_pm_data *pm = &msk->pm;
u8 i;
- pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
+ pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr);
for (i = 0; i < rm_list->nr; i++)
mptcp_event_addr_removed(msk, rm_list->ids[i]);
@@ -282,15 +790,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
msk = mptcp_sk(sk);
- if (subflow->backup != bkup) {
+ if (subflow->backup != bkup)
subflow->backup = bkup;
- mptcp_data_lock(sk);
- if (!sock_owned_by_user(sk))
- msk->last_snd = NULL;
- else
- __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
- mptcp_data_unlock(sk);
- }
mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}
@@ -300,25 +801,29 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- pr_debug("fail_seq=%llu", fail_seq);
+ pr_debug("fail_seq=%llu\n", fail_seq);
- if (!READ_ONCE(msk->allow_infinite_fallback))
+ /* After accepting the fail, we can't create any other subflows */
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_infinite_fallback) {
+ spin_unlock_bh(&msk->fallback_lock);
return;
+ }
+ msk->allow_subflows = false;
+ spin_unlock_bh(&msk->fallback_lock);
if (!subflow->fail_tout) {
- pr_debug("send MP_FAIL response and infinite map");
+ pr_debug("send MP_FAIL response and infinite map\n");
subflow->send_mp_fail = 1;
subflow->send_infinite_map = 1;
tcp_send_ack(sk);
} else {
- pr_debug("MP_FAIL response received");
+ pr_debug("MP_FAIL response received\n");
WRITE_ONCE(subflow->fail_tout, 0);
}
}
-/* path manager helpers */
-
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
unsigned int opt_size, unsigned int remaining,
struct mptcp_addr_info *addr, bool *echo,
@@ -398,7 +903,75 @@ out_unlock:
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
- return mptcp_pm_nl_get_local_id(msk, skc);
+ struct mptcp_pm_addr_entry skc_local = { 0 };
+ struct mptcp_addr_info msk_local;
+
+ if (WARN_ON_ONCE(!msk))
+ return -1;
+
+ /* The 0 ID mapping is defined by the first subflow, copied into the msk
+ * addr
+ */
+ mptcp_local_address((struct sock_common *)msk, &msk_local);
+ mptcp_local_address((struct sock_common *)skc, &skc_local.addr);
+ if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false))
+ return 0;
+
+ skc_local.addr.id = 0;
+ skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
+
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_get_local_id(msk, &skc_local);
+ return mptcp_pm_nl_get_local_id(msk, &skc_local);
+}
+
+bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ struct mptcp_addr_info skc_local;
+
+ mptcp_local_address((struct sock_common *)skc, &skc_local);
+
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_is_backup(msk, &skc_local);
+
+ return mptcp_pm_nl_is_backup(msk, &skc_local);
+}
+
+static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ unsigned int active_max_loss_cnt;
+ struct net *net = sock_net(sk);
+ unsigned int stale_loss_cnt;
+ bool slow;
+
+ stale_loss_cnt = mptcp_stale_loss_cnt(net);
+ if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
+ return;
+
+ /* look for another available subflow not in loss state */
+ active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
+ mptcp_for_each_subflow(msk, iter) {
+ if (iter != subflow && mptcp_subflow_active(iter) &&
+ iter->stale_count < active_max_loss_cnt) {
+ /* we have some alternatives, try to mark this subflow as idle ... */
+ slow = lock_sock_fast(ssk);
+ if (!tcp_rtx_and_write_queues_empty(ssk)) {
+ subflow->stale = 1;
+ __mptcp_retransmit_pending_data(sk);
+ MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE);
+ }
+ unlock_sock_fast(ssk, slow);
+
+ /* always try to push the pending data regardless of re-injections:
+ * we can possibly use backup subflows now, and subflow selection
+ * is cheap under the msk socket lock
+ */
+ __mptcp_push_pending(sk, 0);
+ return;
+ }
+ }
}
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
@@ -413,36 +986,44 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
if (subflow->stale_count < U8_MAX)
subflow->stale_count++;
- mptcp_pm_nl_subflow_chk_stale(msk, ssk);
+ mptcp_pm_subflows_chk_stale(msk, ssk);
} else {
subflow->stale_count = 0;
mptcp_subflow_set_active(subflow);
}
}
-/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses,
- * otherwise allow any matching local/remote pair
- */
-bool mptcp_pm_addr_families_match(const struct sock *sk,
- const struct mptcp_addr_info *loc,
- const struct mptcp_addr_info *rem)
+void mptcp_pm_worker(struct mptcp_sock *msk)
{
- bool mptcp_is_v4 = sk->sk_family == AF_INET;
+ struct mptcp_pm_data *pm = &msk->pm;
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
- bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);
+ msk_owned_by_me(msk);
- if (mptcp_is_v4)
- return loc_is_v4 && rem_is_v4;
+ if (!(pm->status & MPTCP_PM_WORK_MASK))
+ return;
- if (ipv6_only_sock(sk))
- return !loc_is_v4 && !rem_is_v4;
+ spin_lock_bh(&msk->pm.lock);
- return loc_is_v4 == rem_is_v4;
-#else
- return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
-#endif
+ pr_debug("msk=%p status=%x\n", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
+ mptcp_pm_addr_send_ack(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
+ mptcp_pm_rm_addr_recv(msk);
+ }
+ __mptcp_pm_kernel_worker(msk);
+
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+void mptcp_pm_destroy(struct mptcp_sock *msk)
+{
+ mptcp_pm_free_anno_list(msk);
+
+ if (mptcp_pm_is_userspace(msk))
+ mptcp_userspace_pm_free_local_addr_list(msk);
}
void mptcp_pm_data_reset(struct mptcp_sock *msk)
@@ -450,38 +1031,28 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk)
u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
struct mptcp_pm_data *pm = &msk->pm;
- pm->add_addr_signaled = 0;
- pm->add_addr_accepted = 0;
- pm->local_addr_used = 0;
- pm->subflows = 0;
+ memset(&pm->reset, 0, sizeof(pm->reset));
pm->rm_list_tx.nr = 0;
pm->rm_list_rx.nr = 0;
WRITE_ONCE(pm->pm_type, pm_type);
if (pm_type == MPTCP_PM_TYPE_KERNEL) {
- bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
+ bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk);
/* pm->work_pending must be only be set to 'true' when
* pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
*/
WRITE_ONCE(pm->work_pending,
- (!!mptcp_pm_get_local_addr_max(msk) &&
+ (!!mptcp_pm_get_endp_subflow_max(msk) &&
subflows_allowed) ||
- !!mptcp_pm_get_add_addr_signal_max(msk));
+ !!mptcp_pm_get_endp_signal_max(msk));
WRITE_ONCE(pm->accept_addr,
- !!mptcp_pm_get_add_addr_accept_max(msk) &&
+ !!mptcp_pm_get_limit_add_addr_accepted(msk) &&
subflows_allowed);
WRITE_ONCE(pm->accept_subflow, subflows_allowed);
- } else {
- WRITE_ONCE(pm->work_pending, 0);
- WRITE_ONCE(pm->accept_addr, 0);
- WRITE_ONCE(pm->accept_subflow, 0);
- }
- WRITE_ONCE(pm->addr_signal, 0);
- WRITE_ONCE(pm->remote_deny_join_id0, false);
- pm->status = 0;
- bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ }
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
@@ -494,5 +1065,75 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
void __init mptcp_pm_init(void)
{
+ mptcp_pm_kernel_register();
+ mptcp_pm_userspace_register();
mptcp_pm_nl_init();
}
+
+/* Must be called with rcu read lock held */
+struct mptcp_pm_ops *mptcp_pm_find(const char *name)
+{
+ struct mptcp_pm_ops *pm_ops;
+
+ list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
+ if (!strcmp(pm_ops->name, name))
+ return pm_ops;
+ }
+
+ return NULL;
+}
+
+int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops)
+{
+ return 0;
+}
+
+int mptcp_pm_register(struct mptcp_pm_ops *pm_ops)
+{
+ int ret;
+
+ ret = mptcp_pm_validate(pm_ops);
+ if (ret)
+ return ret;
+
+ spin_lock(&mptcp_pm_list_lock);
+ if (mptcp_pm_find(pm_ops->name)) {
+ spin_unlock(&mptcp_pm_list_lock);
+ return -EEXIST;
+ }
+ list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list);
+ spin_unlock(&mptcp_pm_list_lock);
+
+ pr_debug("%s registered\n", pm_ops->name);
+ return 0;
+}
+
+void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops)
+{
+ /* skip unregistering the default path manager */
+ if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel))
+ return;
+
+ spin_lock(&mptcp_pm_list_lock);
+ list_del_rcu(&pm_ops->list);
+ spin_unlock(&mptcp_pm_list_lock);
+}
+
+/* Build string with list of available path manager values.
+ * Similar to tcp_get_available_congestion_control()
+ */
+void mptcp_pm_get_available(char *buf, size_t maxlen)
+{
+ struct mptcp_pm_ops *pm_ops;
+ size_t offs = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs, "%s%s",
+ offs == 0 ? "" : " ", pm_ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
+ }
+ rcu_read_unlock();
+}
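
The registration helpers added above (mptcp_pm_register(), mptcp_pm_unregister(),
mptcp_pm_find() and mptcp_pm_get_available()) turn the path manager into a
pluggable ops table, similar to TCP congestion control registration. As a rough
sketch only, not part of this patch: assuming struct mptcp_pm_ops exposes a
.name and an .owner field like other kernel ops tables, and assuming the
register/unregister helpers were exported to modules, a minimal path manager
could register itself as below. All other callbacks are left unset and the
module boilerplate is an assumption.

	#include <linux/module.h>
	#include "protocol.h"

	static struct mptcp_pm_ops sample_pm_ops = {
		.name	= "sample",	/* listed by mptcp_pm_get_available() */
		.owner	= THIS_MODULE,	/* assumed field, mirroring other ops tables */
	};

	static int __init sample_pm_init(void)
	{
		/* returns -EEXIST if another PM already registered this name */
		return mptcp_pm_register(&sample_pm_ops);
	}

	static void __exit sample_pm_exit(void)
	{
		mptcp_pm_unregister(&sample_pm_ops);
	}

	module_init(sample_pm_init);
	module_exit(sample_pm_exit);
	MODULE_LICENSE("GPL");
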
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
new file mode 100644
index 000000000000..57570a44e418
--- /dev/null
+++ b/net/mptcp/pm_kernel.c
@@ -0,0 +1,1624 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2025, Matthieu Baerts.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <net/netns/generic.h>
+
+#include "protocol.h"
+#include "mib.h"
+#include "mptcp_pm_gen.h"
+
+static int pm_nl_pernet_id;
+
+struct pm_nl_pernet {
+ /* protects pernet updates */
+ spinlock_t lock;
+ struct list_head endp_list;
+ u8 endpoints;
+ u8 endp_signal_max;
+ u8 endp_subflow_max;
+ u8 endp_laminar_max;
+ u8 endp_fullmesh_max;
+ u8 limit_add_addr_accepted;
+ u8 limit_extra_subflows;
+ u8 next_id;
+ DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+};
+
+#define MPTCP_PM_ADDR_MAX 8
+
+static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
+{
+ return net_generic(net, pm_nl_pernet_id);
+}
+
+static struct pm_nl_pernet *
+pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk)
+{
+ return pm_nl_get_pernet(sock_net((struct sock *)msk));
+}
+
+static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
+{
+ return pm_nl_get_pernet(genl_info_net(info));
+}
+
+u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk)
+{
+ const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->endp_signal_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max);
+
+u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->endp_subflow_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
+
+u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->endp_laminar_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max);
+
+u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->endp_fullmesh_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_fullmesh_max);
+
+u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->limit_add_addr_accepted);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted);
+
+u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->limit_extra_subflows);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows);
+
+static bool lookup_subflow_by_daddr(const struct list_head *list,
+ const struct mptcp_addr_info *daddr)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info cur;
+
+ list_for_each_entry(subflow, list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!((1 << inet_sk_state_load(ssk)) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV)))
+ continue;
+
+ mptcp_remote_address((struct sock_common *)ssk, &cur);
+ if (mptcp_addresses_equal(&cur, daddr, daddr->port))
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+select_local_address(const struct pm_nl_pernet *pernet,
+ const struct mptcp_sock *msk,
+ struct mptcp_pm_local *new_local)
+{
+ struct mptcp_pm_addr_entry *entry;
+ bool found = false;
+
+ msk_owned_by_me(msk);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ continue;
+
+ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+ continue;
+
+ new_local->addr = entry->addr;
+ new_local->flags = entry->flags;
+ new_local->ifindex = entry->ifindex;
+ found = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static bool
+select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
+ struct mptcp_pm_local *new_local)
+{
+ struct mptcp_pm_addr_entry *entry;
+ bool found = false;
+
+ rcu_read_lock();
+ /* do not keep any additional per socket state, just signal
+ * the address list in order.
+ * Note: removal from the local address list during the msk life-cycle
+ * can lead to additional addresses not being announced.
+ */
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+ continue;
+
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ continue;
+
+ new_local->addr = entry->addr;
+ new_local->flags = entry->flags;
+ new_local->ifindex = entry->ifindex;
+ found = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static unsigned int
+fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local,
+ struct mptcp_addr_info *addrs)
+{
+ bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
+ struct mptcp_addr_info remote = { 0 };
+ struct sock *sk = (struct sock *)msk;
+
+ if (deny_id0)
+ return 0;
+
+ mptcp_remote_address((struct sock_common *)sk, &remote);
+
+ if (!mptcp_pm_addr_families_match(sk, local, &remote))
+ return 0;
+
+ msk->pm.extra_subflows++;
+ *addrs = remote;
+
+ return 1;
+}
+
+static unsigned int
+fill_remote_addresses_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *local,
+ struct mptcp_addr_info *addrs)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
+ DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct sock *sk = (struct sock *)msk, *ssk;
+ struct mptcp_subflow_context *subflow;
+ int i = 0;
+
+ /* Forbid creation of new subflows matching existing ones, possibly
+ * already created by incoming ADD_ADDR
+ */
+ bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ mptcp_for_each_subflow(msk, subflow)
+ if (READ_ONCE(subflow->local_id) == local->id)
+ __set_bit(subflow->remote_id, unavail_id);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ mptcp_remote_address((struct sock_common *)ssk, &addrs[i]);
+ addrs[i].id = READ_ONCE(subflow->remote_id);
+ if (deny_id0 && !addrs[i].id)
+ continue;
+
+ if (test_bit(addrs[i].id, unavail_id))
+ continue;
+
+ if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
+ continue;
+
+ /* forbid creating multiple address towards this id */
+ __set_bit(addrs[i].id, unavail_id);
+ msk->pm.extra_subflows++;
+ i++;
+
+ if (msk->pm.extra_subflows >= limit_extra_subflows)
+ break;
+ }
+
+ return i;
+}
+
+/* Fill all the remote addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int
+fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local,
+ bool fullmesh, struct mptcp_addr_info *addrs)
+{
+ /* Non-fullmesh: fill in the single entry corresponding to the primary
+ * MPC subflow remote address, and return 1.
+ */
+ if (!fullmesh)
+ return fill_remote_addr(msk, local, addrs);
+
+ /* Fullmesh endpoint: fill all possible remote addresses */
+ return fill_remote_addresses_fullmesh(msk, local, addrs);
+}
+
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list,
+ lockdep_is_held(&pernet->lock)) {
+ if (entry->addr.id == id)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct mptcp_pm_addr_entry *
+__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list,
+ lockdep_is_held(&pernet->lock)) {
+ if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
+ return entry;
+ }
+ return NULL;
+}
+
+static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
+}
+
+/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */
+static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_addr_info mpc_addr;
+ struct pm_nl_pernet *pernet;
+ bool backup = false;
+
+ /* do lazy endpoint usage accounting for the MPC subflows */
+ if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) ||
+ !msk->first)
+ return;
+
+ subflow = mptcp_subflow_ctx(msk->first);
+ pernet = pm_nl_get_pernet_from_msk(msk);
+
+ mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, &mpc_addr);
+ if (entry) {
+ __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
+ msk->mpc_endpoint_id = entry->addr.id;
+ backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ }
+ rcu_read_unlock();
+
+ /* Send MP_PRIO */
+ if (backup)
+ mptcp_pm_send_ack(msk, subflow, true, backup);
+
+ msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
+}
+
+static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
+ u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk);
+ struct sock *sk = (struct sock *)msk;
+ bool signal_and_subflow = false;
+ struct mptcp_pm_local local;
+
+ mptcp_mpc_endpoint_setup(msk);
+ if (!mptcp_is_fully_established(sk))
+ return;
+
+ pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
+ msk->pm.local_addr_used, endp_subflow_max,
+ msk->pm.add_addr_signaled, endp_signal_max,
+ msk->pm.extra_subflows, limit_extra_subflows);
+
+ /* check first for announce */
+ if (msk->pm.add_addr_signaled < endp_signal_max) {
+ /* due to racing events on both ends, we can reach here while a
+ * previous add address is still running: if we invoke
+ * mptcp_pm_announce_addr() now, it will fail and the
+ * corresponding id will be marked as used.
+ * Instead, let the PM machinery reschedule us when the
+ * current address announce is completed.
+ */
+ if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
+ return;
+
+ if (!select_signal_address(pernet, msk, &local))
+ goto subflow;
+
+ /* If the alloc fails, we are under memory pressure: not worth
+ * continuing or trying to create subflows.
+ */
+ if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
+ return;
+
+ __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
+ msk->pm.add_addr_signaled++;
+
+ /* Special case for ID0: set the correct ID */
+ if (local.addr.id == msk->mpc_endpoint_id)
+ local.addr.id = 0;
+
+ mptcp_pm_announce_addr(msk, &local.addr, false);
+ mptcp_pm_addr_send_ack(msk);
+
+ if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ signal_and_subflow = true;
+ }
+
+subflow:
+ /* No need to try establishing subflows to remote id0 if not allowed */
+ if (mptcp_pm_add_addr_c_flag_case(msk))
+ goto exit;
+
+ /* check if should create a new subflow */
+ while (msk->pm.local_addr_used < endp_subflow_max &&
+ msk->pm.extra_subflows < limit_extra_subflows) {
+ struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+ bool fullmesh;
+ int i, nr;
+
+ if (signal_and_subflow)
+ signal_and_subflow = false;
+ else if (!select_local_address(pernet, msk, &local))
+ break;
+
+ fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
+
+ __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
+
+ /* Special case for ID0: set the correct ID */
+ if (local.addr.id == msk->mpc_endpoint_id)
+ local.addr.id = 0;
+ else /* local_addr_used is not decr for ID 0 */
+ msk->pm.local_addr_used++;
+
+ nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs);
+ if (nr == 0)
+ continue;
+
+ spin_unlock_bh(&msk->pm.lock);
+ for (i = 0; i < nr; i++)
+ __mptcp_subflow_connect(sk, &local, &addrs[i]);
+ spin_lock_bh(&msk->pm.lock);
+ }
+
+exit:
+ mptcp_pm_nl_check_work_pending(msk);
+}
+
+static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+static unsigned int
+fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals,
+ bool c_flag_case)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_pm_local *local;
+ int i = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+ bool is_id0;
+
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
+ continue;
+
+ if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
+ continue;
+
+ local = &locals[i];
+ local->addr = entry->addr;
+ local->flags = entry->flags;
+ local->ifindex = entry->ifindex;
+
+ is_id0 = local->addr.id == msk->mpc_endpoint_id;
+
+ if (c_flag_case &&
+ (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) {
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+
+ if (!is_id0)
+ msk->pm.local_addr_used++;
+ }
+
+ /* Special case for ID0: set the correct ID */
+ if (is_id0)
+ local->addr.id = 0;
+
+ msk->pm.extra_subflows++;
+ i++;
+
+ if (msk->pm.extra_subflows >= limit_extra_subflows)
+ break;
+ }
+ rcu_read_unlock();
+
+ return i;
+}
+
+static unsigned int
+fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_pm_local *local;
+ int found = 0;
+
+ /* Forbid creation of new subflows matching existing ones, possibly
+ * already created by 'subflow' endpoints
+ */
+ bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if ((1 << inet_sk_state_load(ssk)) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING |
+ TCPF_CLOSE))
+ continue;
+
+ __set_bit(subflow_get_local_id(subflow), unavail_id);
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR))
+ continue;
+
+ if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
+ continue;
+
+ if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
+ unavail_id))
+ continue;
+
+ local = &locals[0];
+ local->addr = entry->addr;
+ local->flags = entry->flags;
+ local->ifindex = entry->ifindex;
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+
+ if (local->addr.id != msk->mpc_endpoint_id)
+ msk->pm.local_addr_used++;
+ }
+
+ msk->pm.extra_subflows++;
+ found = 1;
+ break;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static unsigned int
+fill_local_addresses_vec_c_flag(struct mptcp_sock *msk,
+ struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_local *local;
+ int i = 0;
+
+ while (msk->pm.local_addr_used < endp_subflow_max) {
+ local = &locals[i];
+
+ if (!select_local_address(pernet, msk, local))
+ break;
+
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+
+ if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
+ continue;
+
+ if (local->addr.id == msk->mpc_endpoint_id)
+ continue;
+
+ msk->pm.local_addr_used++;
+ msk->pm.extra_subflows++;
+ i++;
+
+ if (msk->pm.extra_subflows >= limit_extra_subflows)
+ break;
+ }
+
+ return i;
+}
+
+static unsigned int
+fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *local)
+{
+ struct sock *sk = (struct sock *)msk;
+
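+	/* No endpoint configured: let the routing pick the local address, only
+	 * the family has to match the remote one
+	 */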
+ memset(local, 0, sizeof(*local));
+ local->addr.family =
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ remote->family == AF_INET6 &&
+ ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
+#endif
+ remote->family;
+
+ if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
+ return 0;
+
+ msk->pm.extra_subflows++;
+
+ return 1;
+}
+
+/* Fill all the local addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int
+fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals)
+{
+ bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk);
+
+ /* If there is at least one MPTCP endpoint with a fullmesh flag */
+ if (mptcp_pm_get_endp_fullmesh_max(msk))
+ return fill_local_addresses_vec_fullmesh(msk, remote, locals,
+ c_flag_case);
+
+ /* If there is at least one MPTCP endpoint with a laminar flag */
+ if (mptcp_pm_get_endp_laminar_max(msk))
+ return fill_local_laminar_endp(msk, remote, locals);
+
+	/* Special case: the peer set the C flag; accept one ADD_ADDR even with
+	 * the default limits -- which accept none -- and use 'subflow' endpoints
+	 */
+ if (c_flag_case)
+ return fill_local_addresses_vec_c_flag(msk, remote, locals);
+
+ /* No special case: fill in the single 'IPADDRANY' local address */
+ return fill_local_address_any(msk, remote, &locals[0]);
+}
+
+static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+{
+ u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk);
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info remote;
+ bool sf_created = false;
+ int i, nr;
+
+ pr_debug("accepted %d:%d remote family %d\n",
+ msk->pm.add_addr_accepted, limit_add_addr_accepted,
+ msk->pm.remote.family);
+
+ remote = msk->pm.remote;
+ mptcp_pm_announce_addr(msk, &remote, true);
+ mptcp_pm_addr_send_ack(msk);
+ mptcp_mpc_endpoint_setup(msk);
+
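+	/* Nothing to do if a subflow to this remote address already exists */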
+ if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
+ return;
+
+	/* if the remote address provides no port, use the ID 0 subflow's one */
+ if (!remote.port)
+ remote.port = sk->sk_dport;
+
+ /* connect to the specified remote address, using whatever
+ * local address the routing configuration will pick.
+ */
+ nr = fill_local_addresses_vec(msk, &remote, locals);
+ if (nr == 0)
+ return;
+
+ spin_unlock_bh(&msk->pm.lock);
+ for (i = 0; i < nr; i++)
+ if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0)
+ sf_created = true;
+ spin_lock_bh(&msk->pm.lock);
+
+ if (sf_created) {
+		/* add_addr_accepted is never decremented for ID 0, so do not
+		 * increment it either
+		 */
+ if (remote.id)
+ msk->pm.add_addr_accepted++;
+ if (msk->pm.add_addr_accepted >= limit_add_addr_accepted ||
+ msk->pm.extra_subflows >= limit_extra_subflows)
+ WRITE_ONCE(msk->pm.accept_addr, false);
+ }
+}
+
+void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id)
+{
+ if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
+ u8 limit_add_addr_accepted =
+ mptcp_pm_get_limit_add_addr_accepted(msk);
+
+ /* Note: if the subflow has been closed before, this
+ * add_addr_accepted counter will not be decremented.
+ */
+ if (--msk->pm.add_addr_accepted < limit_add_addr_accepted)
+ WRITE_ONCE(msk->pm.accept_addr, true);
+ }
+}
+
+static bool address_use_port(struct mptcp_pm_addr_entry *entry)
+{
+ return (entry->flags &
+ (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
+ MPTCP_PM_ADDR_FLAG_SIGNAL;
+}
+
+/* caller must ensure the RCU grace period is already elapsed */
+static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
+{
+ if (entry->lsk)
+ sock_release(entry->lsk);
+ kfree(entry);
+}
+
+static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
+ struct mptcp_pm_addr_entry *entry,
+ bool needs_id, bool replace)
+{
+ struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
+ int ret = -EINVAL;
+ u8 addr_max;
+
+ spin_lock_bh(&pernet->lock);
+ /* to keep the code simple, don't do IDR-like allocation for address ID,
+ * just bail when we exceed limits
+ */
+ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
+ pernet->next_id = 1;
+ if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (test_bit(entry->addr.id, pernet->id_bitmap)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+	/* do not insert duplicate addresses, differentiate on port only for
+	 * 'signal' endpoints bound to a specific port
+	 */
+ if (!address_use_port(entry))
+ entry->addr.port = 0;
+ list_for_each_entry(cur, &pernet->endp_list, list) {
+ if (mptcp_addresses_equal(&cur->addr, &entry->addr,
+ cur->addr.port || entry->addr.port)) {
+			/* allow replacing the existing endpoint only if it is
+			 * an implicit one and user-space did not provide an
+			 * endpoint id
+			 */
+ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ if (entry->addr.id)
+ goto out;
+
+ /* allow callers that only need to look up the local
+ * addr's id to skip replacement. This allows them to
+ * avoid calling synchronize_rcu in the packet recv
+ * path.
+ */
+ if (!replace) {
+ kfree(entry);
+ ret = cur->addr.id;
+ goto out;
+ }
+
+ pernet->endpoints--;
+ entry->addr.id = cur->addr.id;
+ list_del_rcu(&cur->list);
+ del_entry = cur;
+ break;
+ }
+ }
+
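+	/* No ID provided: pick the next free one, wrapping around once */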
+ if (!entry->addr.id && needs_id) {
+find_next:
+ entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1,
+ pernet->next_id);
+ if (!entry->addr.id && pernet->next_id != 1) {
+ pernet->next_id = 1;
+ goto find_next;
+ }
+ }
+
+ if (!entry->addr.id && needs_id)
+ goto out;
+
+ __set_bit(entry->addr.id, pernet->id_bitmap);
+ if (entry->addr.id > pernet->next_id)
+ pernet->next_id = entry->addr.id;
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->endp_signal_max;
+ WRITE_ONCE(pernet->endp_signal_max, addr_max + 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->endp_subflow_max;
+ WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
+ addr_max = pernet->endp_laminar_max;
+ WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+ addr_max = pernet->endp_fullmesh_max;
+ WRITE_ONCE(pernet->endp_fullmesh_max, addr_max + 1);
+ }
+
+ pernet->endpoints++;
+ if (!entry->addr.port)
+ list_add_tail_rcu(&entry->list, &pernet->endp_list);
+ else
+ list_add_rcu(&entry->list, &pernet->endp_list);
+ ret = entry->addr.id;
+
+out:
+ spin_unlock_bh(&pernet->lock);
+
+ /* just replaced an existing entry, free it */
+ if (del_entry) {
+ synchronize_rcu();
+ __mptcp_pm_release_addr_entry(del_entry);
+ }
+ return ret;
+}
+
+static struct lock_class_key mptcp_slock_keys[2];
+static struct lock_class_key mptcp_keys[2];
+
+static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
+ int addrlen = sizeof(struct sockaddr_in);
+ struct sockaddr_storage addr;
+ struct sock *newsk, *ssk;
+ int backlog = 1024;
+ int err;
+
+ err = sock_create_kern(sock_net(sk), entry->addr.family,
+ SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
+ if (err)
+ return err;
+
+ newsk = entry->lsk->sk;
+ if (!newsk)
+ return -EINVAL;
+
+	/* The subflow socket lock is acquired nested inside the msk one in
+	 * several places, even by the TCP stack, and this msk is a kernel
+	 * socket: lockdep complains. Instead of propagating the _nested
+	 * modifiers in several places, re-init the lock class for the msk
+	 * socket to an MPTCP-specific one.
+	 */
+ sock_lock_init_class_and_name(newsk,
+ is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET",
+ &mptcp_slock_keys[is_ipv6],
+ is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET",
+ &mptcp_keys[is_ipv6]);
+
+ lock_sock(newsk);
+ ssk = __mptcp_nmpc_sk(mptcp_sk(newsk));
+ release_sock(newsk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
+
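+	/* Bind the first subflow to the endpoint address */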
+ mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (entry->addr.family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ if (ssk->sk_family == AF_INET)
+ err = inet_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ssk->sk_family == AF_INET6)
+ err = inet6_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen);
+#endif
+ if (err)
+ return err;
+
+	/* We don't use mptcp_set_state() here because it needs to be called
+	 * under the msk socket lock. For the moment, it would not do anything
+	 * more than calling inet_sk_state_store(), because the old state is
+	 * known (TCP_CLOSE).
+	 */
+ inet_sk_state_store(newsk, TCP_LISTEN);
+ lock_sock(ssk);
+ WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true);
+ err = __inet_listen_sk(ssk, backlog);
+ if (!err)
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
+ release_sock(ssk);
+ return err;
+}
+
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *skc)
+{
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int ret;
+
+ pernet = pm_nl_get_pernet_from_msk(msk);
+
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, &skc->addr);
+ ret = entry ? entry->addr.id : -1;
+ rcu_read_unlock();
+ if (ret >= 0)
+ return ret;
+
+ /* address not found, add to local list */
+ entry = kmemdup(skc, sizeof(*skc), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->addr.port = 0;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false);
+ if (ret < 0)
+ kfree(entry);
+
+ return ret;
+}
+
+bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ struct mptcp_pm_addr_entry *entry;
+ bool backup;
+
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, skc);
+ backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ rcu_read_unlock();
+
+ return backup;
+}
+
+static int mptcp_nl_add_subflow_or_signal_addr(struct net *net,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_sock *msk;
+ long s_slot = 0, s_num = 0;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info mpc_addr;
+
+ if (!READ_ONCE(msk->fully_established) ||
+ mptcp_pm_is_userspace(msk))
+ goto next;
+
+		/* if the endpoint linked to the initial subflow is re-added
+		 * with a different ID
+		 */
+ mptcp_local_address((struct sock_common *)msk, &mpc_addr);
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ if (mptcp_addresses_equal(addr, &mpc_addr, addr->port))
+ msk->mpc_endpoint_id = addr->id;
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr,
+ struct genl_info *info)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+
+ if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
+ mptcp_pm_address_nl_policy, info->extack) &&
+ tb[MPTCP_PM_ADDR_ATTR_ID])
+ return true;
+ return false;
+}
+
+/* Add an MPTCP endpoint */
+int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct nlattr *attr;
+ int ret;
+
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
+ return -EINVAL;
+
+ attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
+ ret = mptcp_pm_parse_entry(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ if (addr.addr.port && !address_use_port(&addr)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "flags must have signal and not subflow when using port");
+ return -EINVAL;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL &&
+ addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "flags mustn't have both signal and fullmesh");
+ return -EINVAL;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "can't create IMPLICIT endpoint");
+ return -EINVAL;
+ }
+
+ entry = kmemdup(&addr, sizeof(addr), GFP_KERNEL_ACCOUNT);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "can't allocate addr");
+ return -ENOMEM;
+ }
+
+ if (entry->addr.port) {
+ ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
+ if (ret) {
+ GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret);
+ goto out_free;
+ }
+ }
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry,
+ !mptcp_pm_has_addr_attr_id(attr, info),
+ true);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
+ goto out_free;
+ }
+
+ mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr);
+ return 0;
+
+out_free:
+ __mptcp_pm_release_addr_entry(entry);
+ return ret;
+}
+
+static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr,
+ bool force)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+ bool ret;
+
+ list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
+
+ ret = mptcp_remove_anno_list_by_saddr(msk, addr);
+ if (ret || force) {
+ spin_lock_bh(&msk->pm.lock);
+ if (ret) {
+ __set_bit(addr->id, msk->pm.id_avail_bitmap);
+ msk->pm.add_addr_signaled--;
+ }
+ mptcp_pm_remove_addr(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+ return ret;
+}
+
+static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id)
+{
+ /* If it was marked as used, and not ID 0, decrement local_addr_used */
+ if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) &&
+ id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0))
+ msk->pm.local_addr_used--;
+}
+
+static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
+ const struct mptcp_pm_addr_entry *entry)
+{
+ const struct mptcp_addr_info *addr = &entry->addr;
+ struct mptcp_rm_list list = { .nr = 1 };
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ pr_debug("remove_id=%d\n", addr->id);
+
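+	/* Walk all MPTCP sockets in the netns and remove the announce and any
+	 * matching subflow for this endpoint
+	 */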
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ bool remove_subflow;
+
+ if (mptcp_pm_is_userspace(msk))
+ goto next;
+
+ lock_sock(sk);
+ remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr);
+ mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
+ !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
+
+ list.ids[0] = mptcp_endp_get_local_id(msk, addr);
+ if (remove_subflow) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_rm_subflow(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ spin_lock_bh(&msk->pm.lock);
+ __mark_subflow_endp_available(msk, list.ids[0]);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+
+ if (msk->mpc_endpoint_id == entry->addr.id)
+ msk->mpc_endpoint_id = 0;
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int mptcp_nl_remove_id_zero_address(struct net *net,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ list.ids[list.nr++] = 0;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info msk_local;
+
+ if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
+ goto next;
+
+ mptcp_local_address((struct sock_common *)msk, &msk_local);
+ if (!mptcp_addresses_equal(&msk_local, addr, addr->port))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &list);
+ mptcp_pm_rm_subflow(msk, &list);
+ __mark_subflow_endp_available(msk, 0);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/* Remove an MPTCP endpoint */
+int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct nlattr *attr;
+ u8 addr_max;
+ int ret;
+
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
+ return -EINVAL;
+
+ attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ /* the zero id address is special: the first address used by the msk
+ * always gets such an id, so different subflows can have different zero
+ * id addresses. Additionally zero id is not accounted for in id_bitmap.
+ * Let's use an 'mptcp_rm_list' instead of the common remove code.
+ */
+ if (addr.addr.id == 0)
+ return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr);
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found");
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->endp_signal_max;
+ WRITE_ONCE(pernet->endp_signal_max, addr_max - 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->endp_subflow_max;
+ WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
+ addr_max = pernet->endp_laminar_max;
+ WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+ addr_max = pernet->endp_fullmesh_max;
+ WRITE_ONCE(pernet->endp_fullmesh_max, addr_max - 1);
+ }
+
+ pernet->endpoints--;
+ list_del_rcu(&entry->list);
+ __clear_bit(entry->addr.id, pernet->id_bitmap);
+ spin_unlock_bh(&pernet->lock);
+
+ mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry);
+ synchronize_rcu();
+ __mptcp_pm_release_addr_entry(entry);
+
+ return ret;
+}
+
+static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list)
+{
+ struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
+ struct mptcp_pm_addr_entry *entry;
+
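+	/* Build the subflow and announce removal lists for this connection */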
+ list_for_each_entry(entry, rm_list, list) {
+ if (slist.nr < MPTCP_RM_IDS_MAX &&
+ mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
+ slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
+
+ if (alist.nr < MPTCP_RM_IDS_MAX &&
+ mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
+ alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
+ }
+
+ spin_lock_bh(&msk->pm.lock);
+ if (alist.nr) {
+ msk->pm.add_addr_signaled -= alist.nr;
+ mptcp_pm_remove_addr(msk, &alist);
+ }
+ if (slist.nr)
+ mptcp_pm_rm_subflow(msk, &slist);
+ /* Reset counters: maybe some subflows have been removed before */
+ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ msk->pm.local_addr_used = 0;
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+static void mptcp_nl_flush_addrs_list(struct net *net,
+ struct list_head *rm_list)
+{
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ if (list_empty(rm_list))
+ return;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ lock_sock(sk);
+ mptcp_pm_flush_addrs_and_subflows(msk, rm_list);
+ release_sock(sk);
+ }
+
+ sock_put(sk);
+ cond_resched();
+ }
+}
+
+/* caller must ensure the RCU grace period is already elapsed */
+static void __flush_addrs(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct mptcp_pm_addr_entry *cur;
+
+ cur = list_entry(list->next,
+ struct mptcp_pm_addr_entry, list);
+ list_del_rcu(&cur->list);
+ __mptcp_pm_release_addr_entry(cur);
+ }
+}
+
+static void __reset_counters(struct pm_nl_pernet *pernet)
+{
+ WRITE_ONCE(pernet->endp_signal_max, 0);
+ WRITE_ONCE(pernet->endp_subflow_max, 0);
+ WRITE_ONCE(pernet->endp_laminar_max, 0);
+ pernet->endpoints = 0;
+}
+
+int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ LIST_HEAD(free_list);
+
+ spin_lock_bh(&pernet->lock);
+ list_splice_init(&pernet->endp_list, &free_list);
+ __reset_counters(pernet);
+ pernet->next_id = 1;
+ bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ spin_unlock_bh(&pernet->lock);
+ mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list);
+ synchronize_rcu();
+ __flush_addrs(&free_list);
+ return 0;
+}
+
+int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
+ struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry *entry;
+ int ret = -EINVAL;
+
+ rcu_read_lock();
+ entry = __lookup_addr_by_id(pernet, id);
+ if (entry) {
+ *addr = *entry;
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(msg->sk);
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int id = cb->args[0];
+ int i;
+
+ pernet = pm_nl_get_pernet(net);
+
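+	/* Resume the dump after the last ID stored in cb->args[0] */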
+ rcu_read_lock();
+ for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
+ if (test_bit(i, pernet->id_bitmap)) {
+ entry = __lookup_addr_by_id(pernet, i);
+ if (!entry)
+ break;
+
+ if (entry->addr.id <= id)
+ continue;
+
+ if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0)
+ break;
+
+ id = entry->addr.id;
+ }
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = id;
+ return msg->len;
+}
+
+static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
+{
+ struct nlattr *attr = info->attrs[id];
+
+ if (!attr)
+ return 0;
+
+ *limit = nla_get_u32(attr);
+ if (*limit > MPTCP_PM_ADDR_MAX) {
+ NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr,
+ "limit greater than maximum (%u)",
+ MPTCP_PM_ADDR_MAX);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ unsigned int rcv_addrs, subflows;
+ int ret;
+
+ spin_lock_bh(&pernet->lock);
+ rcv_addrs = pernet->limit_add_addr_accepted;
+ ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
+ if (ret)
+ goto unlock;
+
+ subflows = pernet->limit_extra_subflows;
+ ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
+ if (ret)
+ goto unlock;
+
+ WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs);
+ WRITE_ONCE(pernet->limit_extra_subflows, subflows);
+
+unlock:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct sk_buff *msg;
+ void *reply;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ MPTCP_PM_CMD_GET_LIMITS);
+ if (!reply)
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
+ READ_ONCE(pernet->limit_add_addr_accepted)))
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
+ READ_ONCE(pernet->limit_extra_subflows)))
+ goto fail;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+fail:
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+
+ list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_rm_subflow(msk, &list);
+ __mark_subflow_endp_available(msk, list.ids[0]);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+static void mptcp_pm_nl_set_flags_all(struct net *net,
+ struct mptcp_pm_addr_entry *local,
+ u8 changed)
+{
+ u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW);
+ u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow)
+ return;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
+ goto next;
+
+ lock_sock(sk);
+ if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
+ mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup);
+ /* Subflows will only be recreated if the SUBFLOW flag is set */
+ if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH))
+ mptcp_pm_nl_fullmesh(msk, &local->addr);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+}
+
+int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local,
+ struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
+ MPTCP_PM_ADDR_FLAG_FULLMESH;
+ struct net *net = genl_info_net(info);
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ u8 lookup_by_id = 0;
+
+ pernet = pm_nl_get_pernet(net);
+
+ if (local->addr.family == AF_UNSPEC) {
+ lookup_by_id = 1;
+ if (!local->addr.id) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing address ID");
+ return -EOPNOTSUPP;
+ }
+ }
+
+ spin_lock_bh(&pernet->lock);
+ entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) :
+ __lookup_addr(pernet, &local->addr);
+ if (!entry) {
+ spin_unlock_bh(&pernet->lock);
+ NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found");
+ return -EINVAL;
+ }
+ if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
+ (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL |
+ MPTCP_PM_ADDR_FLAG_IMPLICIT))) {
+ spin_unlock_bh(&pernet->lock);
+ NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags");
+ return -EINVAL;
+ }
+
+ changed = (local->flags ^ entry->flags) & mask;
+ entry->flags = (entry->flags & ~mask) | (local->flags & mask);
+ *local = *entry;
+
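+	/* Keep the pernet fullmesh endpoint counter in sync with the new flag */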
+ if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+ u8 addr_max = pernet->endp_fullmesh_max;
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)
+ addr_max++;
+ else
+ addr_max--;
+
+ WRITE_ONCE(pernet->endp_fullmesh_max, addr_max);
+ }
+
+ spin_unlock_bh(&pernet->lock);
+
+ mptcp_pm_nl_set_flags_all(net, local, changed);
+ return 0;
+}
+
+bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
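+	/* Stop the PM worker when the extra subflows limit has been reached or
+	 * no configured endpoint ID is still available on this connection
+	 */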
+ if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) ||
+ (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
+ WRITE_ONCE(msk->pm.work_pending, false);
+ return false;
+ }
+ return true;
+}
+
+/* Called under PM lock */
+void __mptcp_pm_kernel_worker(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+}
+
+static int __net_init pm_nl_init_net(struct net *net)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
+
+ INIT_LIST_HEAD_RCU(&pernet->endp_list);
+
+ /* Cit. 2 subflows ought to be enough for anybody. */
+ pernet->limit_extra_subflows = 2;
+ pernet->next_id = 1;
+ spin_lock_init(&pernet->lock);
+
+	/* No need to initialize the other pernet fields: the struct is zeroed
+	 * at allocation time.
+	 */
+
+ return 0;
+}
+
+static void __net_exit pm_nl_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_list, exit_list) {
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
+
+		/* The net has been removed from the namespace list, so this
+		 * cannot race with other modifiers; the netns core has also
+		 * already waited for an RCU grace period.
+		 */
+ __flush_addrs(&pernet->endp_list);
+ }
+}
+
+static struct pernet_operations mptcp_pm_pernet_ops = {
+ .init = pm_nl_init_net,
+ .exit_batch = pm_nl_exit_net,
+ .id = &pm_nl_pernet_id,
+ .size = sizeof(struct pm_nl_pernet),
+};
+
+struct mptcp_pm_ops mptcp_pm_kernel = {
+ .name = "kernel",
+ .owner = THIS_MODULE,
+};
+
+void __init mptcp_pm_kernel_register(void)
+{
+ if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
+ panic("Failed to register MPTCP PM pernet subsystem.\n");
+
+ mptcp_pm_register(&mptcp_pm_kernel);
+}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 2ea7eae43bdb..d5b383870f79 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -6,1086 +6,8 @@
#define pr_fmt(fmt) "MPTCP: " fmt
-#include <linux/inet.h>
-#include <linux/kernel.h>
-#include <net/tcp.h>
-#include <net/netns/generic.h>
-#include <net/mptcp.h>
-#include <net/genetlink.h>
-#include <uapi/linux/mptcp.h>
-
#include "protocol.h"
-#include "mib.h"
-
-/* forward declaration */
-static struct genl_family mptcp_genl_family;
-
-static int pm_nl_pernet_id;
-
-struct mptcp_pm_add_entry {
- struct list_head list;
- struct mptcp_addr_info addr;
- struct timer_list add_timer;
- struct mptcp_sock *sock;
- u8 retrans_times;
-};
-
-struct pm_nl_pernet {
- /* protects pernet updates */
- spinlock_t lock;
- struct list_head local_addr_list;
- unsigned int addrs;
- unsigned int stale_loss_cnt;
- unsigned int add_addr_signal_max;
- unsigned int add_addr_accept_max;
- unsigned int local_addr_max;
- unsigned int subflows_max;
- unsigned int next_id;
- DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
-};
-
-#define MPTCP_PM_ADDR_MAX 8
-#define ADD_ADDR_RETRANS_MAX 3
-
-static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
-{
- return net_generic(net, pm_nl_pernet_id);
-}
-
-static struct pm_nl_pernet *
-pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk)
-{
- return pm_nl_get_pernet(sock_net((struct sock *)msk));
-}
-
-bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
- const struct mptcp_addr_info *b, bool use_port)
-{
- bool addr_equals = false;
-
- if (a->family == b->family) {
- if (a->family == AF_INET)
- addr_equals = a->addr.s_addr == b->addr.s_addr;
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else
- addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
- } else if (a->family == AF_INET) {
- if (ipv6_addr_v4mapped(&b->addr6))
- addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
- } else if (b->family == AF_INET) {
- if (ipv6_addr_v4mapped(&a->addr6))
- addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
-#endif
- }
-
- if (!addr_equals)
- return false;
- if (!use_port)
- return true;
-
- return a->port == b->port;
-}
-
-static void local_address(const struct sock_common *skc,
- struct mptcp_addr_info *addr)
-{
- addr->family = skc->skc_family;
- addr->port = htons(skc->skc_num);
- if (addr->family == AF_INET)
- addr->addr.s_addr = skc->skc_rcv_saddr;
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else if (addr->family == AF_INET6)
- addr->addr6 = skc->skc_v6_rcv_saddr;
-#endif
-}
-
-static void remote_address(const struct sock_common *skc,
- struct mptcp_addr_info *addr)
-{
- addr->family = skc->skc_family;
- addr->port = skc->skc_dport;
- if (addr->family == AF_INET)
- addr->addr.s_addr = skc->skc_daddr;
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else if (addr->family == AF_INET6)
- addr->addr6 = skc->skc_v6_daddr;
-#endif
-}
-
-static bool lookup_subflow_by_saddr(const struct list_head *list,
- const struct mptcp_addr_info *saddr)
-{
- struct mptcp_subflow_context *subflow;
- struct mptcp_addr_info cur;
- struct sock_common *skc;
-
- list_for_each_entry(subflow, list, node) {
- skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
-
- local_address(skc, &cur);
- if (mptcp_addresses_equal(&cur, saddr, saddr->port))
- return true;
- }
-
- return false;
-}
-
-static bool lookup_subflow_by_daddr(const struct list_head *list,
- const struct mptcp_addr_info *daddr)
-{
- struct mptcp_subflow_context *subflow;
- struct mptcp_addr_info cur;
- struct sock_common *skc;
-
- list_for_each_entry(subflow, list, node) {
- skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
-
- remote_address(skc, &cur);
- if (mptcp_addresses_equal(&cur, daddr, daddr->port))
- return true;
- }
-
- return false;
-}
-
-static struct mptcp_pm_addr_entry *
-select_local_address(const struct pm_nl_pernet *pernet,
- const struct mptcp_sock *msk)
-{
- const struct sock *sk = (const struct sock *)msk;
- struct mptcp_pm_addr_entry *entry, *ret = NULL;
-
- msk_owned_by_me(msk);
-
- rcu_read_lock();
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
- continue;
-
- if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
- continue;
-
- if (entry->addr.family != sk->sk_family) {
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if ((entry->addr.family == AF_INET &&
- !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ||
- (sk->sk_family == AF_INET &&
- !ipv6_addr_v4mapped(&entry->addr.addr6)))
-#endif
- continue;
- }
-
- ret = entry;
- break;
- }
- rcu_read_unlock();
- return ret;
-}
-
-static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
-{
- struct mptcp_pm_addr_entry *entry, *ret = NULL;
-
- rcu_read_lock();
- /* do not keep any additional per socket state, just signal
- * the address list in order.
- * Note: removal from the local address list during the msk life-cycle
- * can lead to additional addresses not being announced.
- */
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
- continue;
-
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
- continue;
-
- ret = entry;
- break;
- }
- rcu_read_unlock();
- return ret;
-}
-
-unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
-{
- const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
-
- return READ_ONCE(pernet->add_addr_signal_max);
-}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
-
-unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
-{
- struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
-
- return READ_ONCE(pernet->add_addr_accept_max);
-}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
-
-unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
-{
- struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
-
- return READ_ONCE(pernet->subflows_max);
-}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
-
-unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
-{
- struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
-
- return READ_ONCE(pernet->local_addr_max);
-}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
-
-bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
-{
- struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
-
- if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
- (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
- MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
- WRITE_ONCE(msk->pm.work_pending, false);
- return false;
- }
- return true;
-}
-
-struct mptcp_pm_add_entry *
-mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
-{
- struct mptcp_pm_add_entry *entry;
-
- lockdep_assert_held(&msk->pm.lock);
-
- list_for_each_entry(entry, &msk->pm.anno_list, list) {
- if (mptcp_addresses_equal(&entry->addr, addr, true))
- return entry;
- }
-
- return NULL;
-}
-
-bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
-{
- struct mptcp_pm_add_entry *entry;
- struct mptcp_addr_info saddr;
- bool ret = false;
-
- local_address((struct sock_common *)sk, &saddr);
-
- spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(entry, &msk->pm.anno_list, list) {
- if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
- ret = true;
- goto out;
- }
- }
-
-out:
- spin_unlock_bh(&msk->pm.lock);
- return ret;
-}
-
-static void mptcp_pm_add_timer(struct timer_list *timer)
-{
- struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
- struct mptcp_sock *msk = entry->sock;
- struct sock *sk = (struct sock *)msk;
-
- pr_debug("msk=%p", msk);
-
- if (!msk)
- return;
-
- if (inet_sk_state_load(sk) == TCP_CLOSE)
- return;
-
- if (!entry->addr.id)
- return;
-
- if (mptcp_pm_should_add_signal_addr(msk)) {
- sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
- goto out;
- }
-
- spin_lock_bh(&msk->pm.lock);
-
- if (!mptcp_pm_should_add_signal_addr(msk)) {
- pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id);
- mptcp_pm_announce_addr(msk, &entry->addr, false);
- mptcp_pm_add_addr_send_ack(msk);
- entry->retrans_times++;
- }
-
- if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
- sk_reset_timer(sk, timer,
- jiffies + mptcp_get_add_addr_timeout(sock_net(sk)));
-
- spin_unlock_bh(&msk->pm.lock);
-
- if (entry->retrans_times == ADD_ADDR_RETRANS_MAX)
- mptcp_pm_subflow_established(msk);
-
-out:
- __sock_put(sk);
-}
-
-struct mptcp_pm_add_entry *
-mptcp_pm_del_add_timer(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr, bool check_id)
-{
- struct mptcp_pm_add_entry *entry;
- struct sock *sk = (struct sock *)msk;
-
- spin_lock_bh(&msk->pm.lock);
- entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
- if (entry && (!check_id || entry->addr.id == addr->id))
- entry->retrans_times = ADD_ADDR_RETRANS_MAX;
- spin_unlock_bh(&msk->pm.lock);
-
- if (entry && (!check_id || entry->addr.id == addr->id))
- sk_stop_timer_sync(sk, &entry->add_timer);
-
- return entry;
-}
-
-bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
- const struct mptcp_pm_addr_entry *entry)
-{
- struct mptcp_pm_add_entry *add_entry = NULL;
- struct sock *sk = (struct sock *)msk;
- struct net *net = sock_net(sk);
-
- lockdep_assert_held(&msk->pm.lock);
-
- add_entry = mptcp_lookup_anno_list_by_saddr(msk, &entry->addr);
-
- if (add_entry) {
- if (mptcp_pm_is_kernel(msk))
- return false;
-
- sk_reset_timer(sk, &add_entry->add_timer,
- jiffies + mptcp_get_add_addr_timeout(net));
- return true;
- }
-
- add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
- if (!add_entry)
- return false;
-
- list_add(&add_entry->list, &msk->pm.anno_list);
-
- add_entry->addr = entry->addr;
- add_entry->sock = msk;
- add_entry->retrans_times = 0;
-
- timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
- sk_reset_timer(sk, &add_entry->add_timer,
- jiffies + mptcp_get_add_addr_timeout(net));
-
- return true;
-}
-
-void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
-{
- struct mptcp_pm_add_entry *entry, *tmp;
- struct sock *sk = (struct sock *)msk;
- LIST_HEAD(free_list);
-
- pr_debug("msk=%p", msk);
-
- spin_lock_bh(&msk->pm.lock);
- list_splice_init(&msk->pm.anno_list, &free_list);
- spin_unlock_bh(&msk->pm.lock);
-
- list_for_each_entry_safe(entry, tmp, &free_list, list) {
- sk_stop_timer_sync(sk, &entry->add_timer);
- kfree(entry);
- }
-}
-
-static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr,
- const struct mptcp_addr_info *addr)
-{
- int i;
-
- for (i = 0; i < nr; i++) {
- if (addrs[i].id == addr->id)
- return true;
- }
-
- return false;
-}
-
-/* Fill all the remote addresses into the array addrs[],
- * and return the array size.
- */
-static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh,
- struct mptcp_addr_info *addrs)
-{
- bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
- struct sock *sk = (struct sock *)msk, *ssk;
- struct mptcp_subflow_context *subflow;
- struct mptcp_addr_info remote = { 0 };
- unsigned int subflows_max;
- int i = 0;
-
- subflows_max = mptcp_pm_get_subflows_max(msk);
- remote_address((struct sock_common *)sk, &remote);
-
- /* Non-fullmesh endpoint, fill in the single entry
- * corresponding to the primary MPC subflow remote address
- */
- if (!fullmesh) {
- if (deny_id0)
- return 0;
-
- msk->pm.subflows++;
- addrs[i++] = remote;
- } else {
- mptcp_for_each_subflow(msk, subflow) {
- ssk = mptcp_subflow_tcp_sock(subflow);
- remote_address((struct sock_common *)ssk, &addrs[i]);
- addrs[i].id = subflow->remote_id;
- if (deny_id0 && !addrs[i].id)
- continue;
-
- if (!lookup_address_in_vec(addrs, i, &addrs[i]) &&
- msk->pm.subflows < subflows_max) {
- msk->pm.subflows++;
- i++;
- }
- }
- }
-
- return i;
-}
-
-static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
- bool prio, bool backup)
-{
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- bool slow;
-
- pr_debug("send ack for %s",
- prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
-
- slow = lock_sock_fast(ssk);
- if (prio) {
- if (subflow->backup != backup)
- msk->last_snd = NULL;
-
- subflow->send_mp_prio = 1;
- subflow->backup = backup;
- subflow->request_bkup = backup;
- }
-
- __mptcp_subflow_send_ack(ssk);
- unlock_sock_fast(ssk, slow);
-}
-
-static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
- bool prio, bool backup)
-{
- spin_unlock_bh(&msk->pm.lock);
- __mptcp_pm_send_ack(msk, subflow, prio, backup);
- spin_lock_bh(&msk->pm.lock);
-}
-
-static struct mptcp_pm_addr_entry *
-__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
-{
- struct mptcp_pm_addr_entry *entry;
-
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if (entry->addr.id == id)
- return entry;
- }
- return NULL;
-}
-
-static struct mptcp_pm_addr_entry *
-__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info,
- bool lookup_by_id)
-{
- struct mptcp_pm_addr_entry *entry;
-
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if ((!lookup_by_id &&
- mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) ||
- (lookup_by_id && entry->addr.id == info->id))
- return entry;
- }
- return NULL;
-}
-
-static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
-{
- struct sock *sk = (struct sock *)msk;
- struct mptcp_pm_addr_entry *local;
- unsigned int add_addr_signal_max;
- unsigned int local_addr_max;
- struct pm_nl_pernet *pernet;
- unsigned int subflows_max;
-
- pernet = pm_nl_get_pernet(sock_net(sk));
-
- add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
- local_addr_max = mptcp_pm_get_local_addr_max(msk);
- subflows_max = mptcp_pm_get_subflows_max(msk);
-
- /* do lazy endpoint usage accounting for the MPC subflows */
- if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first);
- struct mptcp_pm_addr_entry *entry;
- struct mptcp_addr_info mpc_addr;
- bool backup = false;
-
- local_address((struct sock_common *)msk->first, &mpc_addr);
- rcu_read_lock();
- entry = __lookup_addr(pernet, &mpc_addr, false);
- if (entry) {
- __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
- msk->mpc_endpoint_id = entry->addr.id;
- backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
- }
- rcu_read_unlock();
-
- if (backup)
- mptcp_pm_send_ack(msk, subflow, true, backup);
-
- msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
- }
-
- pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
- msk->pm.local_addr_used, local_addr_max,
- msk->pm.add_addr_signaled, add_addr_signal_max,
- msk->pm.subflows, subflows_max);
-
- /* check first for announce */
- if (msk->pm.add_addr_signaled < add_addr_signal_max) {
- local = select_signal_address(pernet, msk);
-
- /* due to racing events on both ends we can reach here while
- * previous add address is still running: if we invoke now
- * mptcp_pm_announce_addr(), that will fail and the
- * corresponding id will be marked as used.
- * Instead let the PM machinery reschedule us when the
- * current address announce will be completed.
- */
- if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
- return;
-
- if (local) {
- if (mptcp_pm_alloc_anno_list(msk, local)) {
- __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
- msk->pm.add_addr_signaled++;
- mptcp_pm_announce_addr(msk, &local->addr, false);
- mptcp_pm_nl_addr_send_ack(msk);
- }
- }
- }
-
- /* check if should create a new subflow */
- while (msk->pm.local_addr_used < local_addr_max &&
- msk->pm.subflows < subflows_max) {
- struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
- bool fullmesh;
- int i, nr;
-
- local = select_local_address(pernet, msk);
- if (!local)
- break;
-
- fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
-
- msk->pm.local_addr_used++;
- nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
- if (nr)
- __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
- spin_unlock_bh(&msk->pm.lock);
- for (i = 0; i < nr; i++)
- __mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
- spin_lock_bh(&msk->pm.lock);
- }
- mptcp_pm_nl_check_work_pending(msk);
-}
-
-static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
-{
- mptcp_pm_create_subflow_or_signal_addr(msk);
-}
-
-static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
-{
- mptcp_pm_create_subflow_or_signal_addr(msk);
-}
-
-/* Fill all the local addresses into the array addrs[],
- * and return the array size.
- */
-static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
- struct mptcp_addr_info *addrs)
-{
- struct sock *sk = (struct sock *)msk;
- struct mptcp_pm_addr_entry *entry;
- struct mptcp_addr_info local;
- struct pm_nl_pernet *pernet;
- unsigned int subflows_max;
- int i = 0;
-
- pernet = pm_nl_get_pernet_from_msk(msk);
- subflows_max = mptcp_pm_get_subflows_max(msk);
-
- rcu_read_lock();
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
- continue;
-
- if (entry->addr.family != sk->sk_family) {
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if ((entry->addr.family == AF_INET &&
- !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ||
- (sk->sk_family == AF_INET &&
- !ipv6_addr_v4mapped(&entry->addr.addr6)))
-#endif
- continue;
- }
-
- if (msk->pm.subflows < subflows_max) {
- msk->pm.subflows++;
- addrs[i++] = entry->addr;
- }
- }
- rcu_read_unlock();
-
- /* If the array is empty, fill in the single
- * 'IPADDRANY' local address
- */
- if (!i) {
- memset(&local, 0, sizeof(local));
- local.family = msk->pm.remote.family;
-
- msk->pm.subflows++;
- addrs[i++] = local;
- }
-
- return i;
-}
-
-static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
-{
- struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
- struct sock *sk = (struct sock *)msk;
- unsigned int add_addr_accept_max;
- struct mptcp_addr_info remote;
- unsigned int subflows_max;
- int i, nr;
-
- add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
- subflows_max = mptcp_pm_get_subflows_max(msk);
-
- pr_debug("accepted %d:%d remote family %d",
- msk->pm.add_addr_accepted, add_addr_accept_max,
- msk->pm.remote.family);
-
- remote = msk->pm.remote;
- mptcp_pm_announce_addr(msk, &remote, true);
- mptcp_pm_nl_addr_send_ack(msk);
-
- if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
- return;
-
- /* pick id 0 port, if none is provided the remote address */
- if (!remote.port)
- remote.port = sk->sk_dport;
-
- /* connect to the specified remote address, using whatever
- * local address the routing configuration will pick.
- */
- nr = fill_local_addresses_vec(msk, addrs);
-
- msk->pm.add_addr_accepted++;
- if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
- msk->pm.subflows >= subflows_max)
- WRITE_ONCE(msk->pm.accept_addr, false);
-
- spin_unlock_bh(&msk->pm.lock);
- for (i = 0; i < nr; i++)
- __mptcp_subflow_connect(sk, &addrs[i], &remote);
- spin_lock_bh(&msk->pm.lock);
-}
-
-void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
-
- msk_owned_by_me(msk);
- lockdep_assert_held(&msk->pm.lock);
-
- if (!mptcp_pm_should_add_signal(msk) &&
- !mptcp_pm_should_rm_signal(msk))
- return;
-
- subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
- if (subflow)
- mptcp_pm_send_ack(msk, subflow, false, false);
-}
-
-int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr,
- struct mptcp_addr_info *rem,
- u8 bkup)
-{
- struct mptcp_subflow_context *subflow;
-
- pr_debug("bkup=%d", bkup);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- struct mptcp_addr_info local, remote;
-
- local_address((struct sock_common *)ssk, &local);
- if (!mptcp_addresses_equal(&local, addr, addr->port))
- continue;
-
- if (rem && rem->family != AF_UNSPEC) {
- remote_address((struct sock_common *)ssk, &remote);
- if (!mptcp_addresses_equal(&remote, rem, rem->port))
- continue;
- }
-
- __mptcp_pm_send_ack(msk, subflow, true, bkup);
- return 0;
- }
-
- return -EINVAL;
-}
-
-static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id)
-{
- return local_id == id || (!local_id && msk->mpc_endpoint_id == id);
-}
-
-static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
- const struct mptcp_rm_list *rm_list,
- enum linux_mptcp_mib_field rm_type)
-{
- struct mptcp_subflow_context *subflow, *tmp;
- struct sock *sk = (struct sock *)msk;
- u8 i;
-
- pr_debug("%s rm_list_nr %d",
- rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);
-
- msk_owned_by_me(msk);
-
- if (sk->sk_state == TCP_LISTEN)
- return;
-
- if (!rm_list->nr)
- return;
-
- if (list_empty(&msk->conn_list))
- return;
-
- for (i = 0; i < rm_list->nr; i++) {
- u8 rm_id = rm_list->ids[i];
- bool removed = false;
-
- mptcp_for_each_subflow_safe(msk, subflow, tmp) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
- u8 id = subflow->local_id;
-
- if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id)
- continue;
- if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id))
- continue;
-
- pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u",
- rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
- i, rm_id, subflow->local_id, subflow->remote_id,
- msk->mpc_endpoint_id);
- spin_unlock_bh(&msk->pm.lock);
- mptcp_subflow_shutdown(sk, ssk, how);
-
- /* the following takes care of updating the subflows counter */
- mptcp_close_ssk(sk, ssk, subflow);
- spin_lock_bh(&msk->pm.lock);
-
- removed = true;
- __MPTCP_INC_STATS(sock_net(sk), rm_type);
- }
- if (rm_type == MPTCP_MIB_RMSUBFLOW)
- __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap);
- if (!removed)
- continue;
-
- if (!mptcp_pm_is_kernel(msk))
- continue;
-
- if (rm_type == MPTCP_MIB_RMADDR) {
- msk->pm.add_addr_accepted--;
- WRITE_ONCE(msk->pm.accept_addr, true);
- } else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
- msk->pm.local_addr_used--;
- }
- }
-}
-
-static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
-{
- mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
-}
-
-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
- const struct mptcp_rm_list *rm_list)
-{
- mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
-}
-
-void mptcp_pm_nl_work(struct mptcp_sock *msk)
-{
- struct mptcp_pm_data *pm = &msk->pm;
-
- msk_owned_by_me(msk);
-
- if (!(pm->status & MPTCP_PM_WORK_MASK))
- return;
-
- spin_lock_bh(&msk->pm.lock);
-
- pr_debug("msk=%p status=%x", msk, pm->status);
- if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
- pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
- mptcp_pm_nl_add_addr_received(msk);
- }
- if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
- pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
- mptcp_pm_nl_addr_send_ack(msk);
- }
- if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
- pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
- mptcp_pm_nl_rm_addr_received(msk);
- }
- if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
- mptcp_pm_nl_fully_established(msk);
- }
- if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
- mptcp_pm_nl_subflow_established(msk);
- }
-
- spin_unlock_bh(&msk->pm.lock);
-}
-
-static bool address_use_port(struct mptcp_pm_addr_entry *entry)
-{
- return (entry->flags &
- (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
- MPTCP_PM_ADDR_FLAG_SIGNAL;
-}
-
-/* caller must ensure the RCU grace period is already elapsed */
-static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
-{
- if (entry->lsk)
- sock_release(entry->lsk);
- kfree(entry);
-}
-
-static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
- struct mptcp_pm_addr_entry *entry)
-{
- struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
- unsigned int addr_max;
- int ret = -EINVAL;
-
- spin_lock_bh(&pernet->lock);
- /* to keep the code simple, don't do IDR-like allocation for address ID,
- * just bail when we exceed limits
- */
- if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
- pernet->next_id = 1;
- if (pernet->addrs >= MPTCP_PM_ADDR_MAX) {
- ret = -ERANGE;
- goto out;
- }
- if (test_bit(entry->addr.id, pernet->id_bitmap)) {
- ret = -EBUSY;
- goto out;
- }
-
- /* do not insert duplicate address, differentiate on port only
- * singled addresses
- */
- if (!address_use_port(entry))
- entry->addr.port = 0;
- list_for_each_entry(cur, &pernet->local_addr_list, list) {
- if (mptcp_addresses_equal(&cur->addr, &entry->addr,
- cur->addr.port || entry->addr.port)) {
- /* allow replacing the exiting endpoint only if such
- * endpoint is an implicit one and the user-space
- * did not provide an endpoint id
- */
- if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) {
- ret = -EEXIST;
- goto out;
- }
- if (entry->addr.id)
- goto out;
-
- pernet->addrs--;
- entry->addr.id = cur->addr.id;
- list_del_rcu(&cur->list);
- del_entry = cur;
- break;
- }
- }
-
- if (!entry->addr.id) {
-find_next:
- entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
- MPTCP_PM_MAX_ADDR_ID + 1,
- pernet->next_id);
- if (!entry->addr.id && pernet->next_id != 1) {
- pernet->next_id = 1;
- goto find_next;
- }
- }
-
- if (!entry->addr.id)
- goto out;
-
- __set_bit(entry->addr.id, pernet->id_bitmap);
- if (entry->addr.id > pernet->next_id)
- pernet->next_id = entry->addr.id;
-
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
- addr_max = pernet->add_addr_signal_max;
- WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
- }
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
- addr_max = pernet->local_addr_max;
- WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
- }
-
- pernet->addrs++;
- if (!entry->addr.port)
- list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
- else
- list_add_rcu(&entry->list, &pernet->local_addr_list);
- ret = entry->addr.id;
-
-out:
- spin_unlock_bh(&pernet->lock);
-
- /* just replaced an existing entry, free it */
- if (del_entry) {
- synchronize_rcu();
- __mptcp_pm_release_addr_entry(del_entry);
- }
- return ret;
-}
-
-static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
- struct mptcp_pm_addr_entry *entry)
-{
- int addrlen = sizeof(struct sockaddr_in);
- struct sockaddr_storage addr;
- struct mptcp_sock *msk;
- struct socket *ssock;
- int backlog = 1024;
- int err;
-
- err = sock_create_kern(sock_net(sk), entry->addr.family,
- SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
- if (err)
- return err;
-
- msk = mptcp_sk(entry->lsk->sk);
- if (!msk)
- return -EINVAL;
-
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
- return -EINVAL;
-
- mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (entry->addr.family == AF_INET6)
- addrlen = sizeof(struct sockaddr_in6);
-#endif
- err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen);
- if (err)
- return err;
-
- err = kernel_listen(ssock, backlog);
- if (err)
- return err;
-
- mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED);
-
- return 0;
-}
-
-int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
-{
- struct mptcp_pm_addr_entry *entry;
- struct mptcp_addr_info skc_local;
- struct mptcp_addr_info msk_local;
- struct pm_nl_pernet *pernet;
- int ret = -1;
-
- if (WARN_ON_ONCE(!msk))
- return -1;
-
- /* The 0 ID mapping is defined by the first subflow, copied into the msk
- * addr
- */
- local_address((struct sock_common *)msk, &msk_local);
- local_address((struct sock_common *)skc, &skc_local);
- if (mptcp_addresses_equal(&msk_local, &skc_local, false))
- return 0;
-
- if (mptcp_pm_is_userspace(msk))
- return mptcp_userspace_pm_get_local_id(msk, &skc_local);
-
- pernet = pm_nl_get_pernet_from_msk(msk);
-
- rcu_read_lock();
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (mptcp_addresses_equal(&entry->addr, &skc_local, entry->addr.port)) {
- ret = entry->addr.id;
- break;
- }
- }
- rcu_read_unlock();
- if (ret >= 0)
- return ret;
-
- /* address not found, add to local list */
- entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
- if (!entry)
- return -ENOMEM;
-
- entry->addr = skc_local;
- entry->addr.id = 0;
- entry->addr.port = 0;
- entry->ifindex = 0;
- entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
- entry->lsk = NULL;
- ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
- if (ret < 0)
- kfree(entry);
-
- return ret;
-}
+#include "mptcp_pm_gen.h"
#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1
@@ -1093,70 +15,10 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
[MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
[MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME,
- .flags = GENL_UNS_ADMIN_PERM,
+ .flags = GENL_MCAST_CAP_NET_ADMIN,
},
};
-static const struct nla_policy
-mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
- [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
- [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
- [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
- [MPTCP_PM_ADDR_ATTR_ADDR6] =
- NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
- [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
- [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
- [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
-};
-
-static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
- [MPTCP_PM_ATTR_ADDR] =
- NLA_POLICY_NESTED(mptcp_pm_addr_policy),
- [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
- [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
- [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
- [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, },
- [MPTCP_PM_ATTR_ADDR_REMOTE] =
- NLA_POLICY_NESTED(mptcp_pm_addr_policy),
-};
-
-void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
-{
- struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
- struct sock *sk = (struct sock *)msk;
- unsigned int active_max_loss_cnt;
- struct net *net = sock_net(sk);
- unsigned int stale_loss_cnt;
- bool slow;
-
- stale_loss_cnt = mptcp_stale_loss_cnt(net);
- if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
- return;
-
- /* look for another available subflow not in loss state */
- active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
- mptcp_for_each_subflow(msk, iter) {
- if (iter != subflow && mptcp_subflow_active(iter) &&
- iter->stale_count < active_max_loss_cnt) {
- /* we have some alternatives, try to mark this subflow as idle ...*/
- slow = lock_sock_fast(ssk);
- if (!tcp_rtx_and_write_queues_empty(ssk)) {
- subflow->stale = 1;
- __mptcp_retransmit_pending_data(sk);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE);
- }
- unlock_sock_fast(ssk, slow);
-
- /* always try to push the pending data regardless of re-injections:
- * we can possibly use backup subflows now, and subflow selection
- * is cheap under the msk socket lock
- */
- __mptcp_push_pending(sk, 0);
- return;
- }
- }
-}
-
static int mptcp_pm_family_to_addr(int family)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1181,7 +43,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
/* no validation needed - was already done via nested policy */
err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
- mptcp_pm_addr_policy, info->extack);
+ mptcp_pm_address_nl_policy, info->extack);
if (err)
return err;
@@ -1251,7 +113,7 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
return err;
if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
- u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+ s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
entry->ifindex = val;
}
@@ -1265,363 +127,6 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
return 0;
}
-static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
-{
- return pm_nl_get_pernet(genl_info_net(info));
-}
-
-static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
-{
- struct mptcp_sock *msk;
- long s_slot = 0, s_num = 0;
-
- while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
- struct sock *sk = (struct sock *)msk;
-
- if (!READ_ONCE(msk->fully_established) ||
- mptcp_pm_is_userspace(msk))
- goto next;
-
- lock_sock(sk);
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_create_subflow_or_signal_addr(msk);
- spin_unlock_bh(&msk->pm.lock);
- release_sock(sk);
-
-next:
- sock_put(sk);
- cond_resched();
- }
-
- return 0;
-}
-
-static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
-{
- struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- struct mptcp_pm_addr_entry addr, *entry;
- int ret;
-
- ret = mptcp_pm_parse_entry(attr, info, true, &addr);
- if (ret < 0)
- return ret;
-
- if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
- GENL_SET_ERR_MSG(info, "flags must have signal when using port");
- return -EINVAL;
- }
-
- if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL &&
- addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
- GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh");
- return -EINVAL;
- }
-
- if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) {
- GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint");
- return -EINVAL;
- }
-
- entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
- if (!entry) {
- GENL_SET_ERR_MSG(info, "can't allocate addr");
- return -ENOMEM;
- }
-
- *entry = addr;
- if (entry->addr.port) {
- ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
- if (ret) {
- GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret);
- goto out_free;
- }
- }
- ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
- if (ret < 0) {
- GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
- goto out_free;
- }
-
- mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
- return 0;
-
-out_free:
- __mptcp_pm_release_addr_entry(entry);
- return ret;
-}
-
-int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
- u8 *flags, int *ifindex)
-{
- struct mptcp_pm_addr_entry *entry;
- struct sock *sk = (struct sock *)msk;
- struct net *net = sock_net(sk);
-
- *flags = 0;
- *ifindex = 0;
-
- if (id) {
- if (mptcp_pm_is_userspace(msk))
- return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk,
- id,
- flags,
- ifindex);
-
- rcu_read_lock();
- entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id);
- if (entry) {
- *flags = entry->flags;
- *ifindex = entry->ifindex;
- }
- rcu_read_unlock();
- }
-
- return 0;
-}
-
-static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
-{
- struct mptcp_pm_add_entry *entry;
-
- entry = mptcp_pm_del_add_timer(msk, addr, false);
- if (entry) {
- list_del(&entry->list);
- kfree(entry);
- return true;
- }
-
- return false;
-}
-
-static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr,
- bool force)
-{
- struct mptcp_rm_list list = { .nr = 0 };
- bool ret;
-
- list.ids[list.nr++] = addr->id;
-
- ret = remove_anno_list_by_saddr(msk, addr);
- if (ret || force) {
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_remove_addr(msk, &list);
- spin_unlock_bh(&msk->pm.lock);
- }
- return ret;
-}
-
-static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
- const struct mptcp_pm_addr_entry *entry)
-{
- const struct mptcp_addr_info *addr = &entry->addr;
- struct mptcp_rm_list list = { .nr = 0 };
- long s_slot = 0, s_num = 0;
- struct mptcp_sock *msk;
-
- pr_debug("remove_id=%d", addr->id);
-
- list.ids[list.nr++] = addr->id;
-
- while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
- struct sock *sk = (struct sock *)msk;
- bool remove_subflow;
-
- if (mptcp_pm_is_userspace(msk))
- goto next;
-
- if (list_empty(&msk->conn_list)) {
- mptcp_pm_remove_anno_addr(msk, addr, false);
- goto next;
- }
-
- lock_sock(sk);
- remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
- mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
- !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
- if (remove_subflow)
- mptcp_pm_remove_subflow(msk, &list);
- release_sock(sk);
-
-next:
- sock_put(sk);
- cond_resched();
- }
-
- return 0;
-}
-
-static int mptcp_nl_remove_id_zero_address(struct net *net,
- struct mptcp_addr_info *addr)
-{
- struct mptcp_rm_list list = { .nr = 0 };
- long s_slot = 0, s_num = 0;
- struct mptcp_sock *msk;
-
- list.ids[list.nr++] = 0;
-
- while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
- struct sock *sk = (struct sock *)msk;
- struct mptcp_addr_info msk_local;
-
- if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
- goto next;
-
- local_address((struct sock_common *)msk, &msk_local);
- if (!mptcp_addresses_equal(&msk_local, addr, addr->port))
- goto next;
-
- lock_sock(sk);
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_remove_addr(msk, &list);
- mptcp_pm_nl_rm_subflow_received(msk, &list);
- spin_unlock_bh(&msk->pm.lock);
- release_sock(sk);
-
-next:
- sock_put(sk);
- cond_resched();
- }
-
- return 0;
-}
-
-static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
-{
- struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- struct mptcp_pm_addr_entry addr, *entry;
- unsigned int addr_max;
- int ret;
-
- ret = mptcp_pm_parse_entry(attr, info, false, &addr);
- if (ret < 0)
- return ret;
-
- /* the zero id address is special: the first address used by the msk
- * always gets such an id, so different subflows can have different zero
- * id addresses. Additionally zero id is not accounted for in id_bitmap.
- * Let's use an 'mptcp_rm_list' instead of the common remove code.
- */
- if (addr.addr.id == 0)
- return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr);
-
- spin_lock_bh(&pernet->lock);
- entry = __lookup_addr_by_id(pernet, addr.addr.id);
- if (!entry) {
- GENL_SET_ERR_MSG(info, "address not found");
- spin_unlock_bh(&pernet->lock);
- return -EINVAL;
- }
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
- addr_max = pernet->add_addr_signal_max;
- WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
- }
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
- addr_max = pernet->local_addr_max;
- WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
- }
-
- pernet->addrs--;
- list_del_rcu(&entry->list);
- __clear_bit(entry->addr.id, pernet->id_bitmap);
- spin_unlock_bh(&pernet->lock);
-
- mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry);
- synchronize_rcu();
- __mptcp_pm_release_addr_entry(entry);
-
- return ret;
-}
-
-void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
- struct list_head *rm_list)
-{
- struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
- struct mptcp_pm_addr_entry *entry;
-
- list_for_each_entry(entry, rm_list, list) {
- if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
- slist.nr < MPTCP_RM_IDS_MAX)
- slist.ids[slist.nr++] = entry->addr.id;
-
- if (remove_anno_list_by_saddr(msk, &entry->addr) &&
- alist.nr < MPTCP_RM_IDS_MAX)
- alist.ids[alist.nr++] = entry->addr.id;
- }
-
- if (alist.nr) {
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_remove_addr(msk, &alist);
- spin_unlock_bh(&msk->pm.lock);
- }
- if (slist.nr)
- mptcp_pm_remove_subflow(msk, &slist);
-}
-
-static void mptcp_nl_remove_addrs_list(struct net *net,
- struct list_head *rm_list)
-{
- long s_slot = 0, s_num = 0;
- struct mptcp_sock *msk;
-
- if (list_empty(rm_list))
- return;
-
- while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
- struct sock *sk = (struct sock *)msk;
-
- if (!mptcp_pm_is_userspace(msk)) {
- lock_sock(sk);
- mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
- release_sock(sk);
- }
-
- sock_put(sk);
- cond_resched();
- }
-}
-
-/* caller must ensure the RCU grace period is already elapsed */
-static void __flush_addrs(struct list_head *list)
-{
- while (!list_empty(list)) {
- struct mptcp_pm_addr_entry *cur;
-
- cur = list_entry(list->next,
- struct mptcp_pm_addr_entry, list);
- list_del_rcu(&cur->list);
- __mptcp_pm_release_addr_entry(cur);
- }
-}
-
-static void __reset_counters(struct pm_nl_pernet *pernet)
-{
- WRITE_ONCE(pernet->add_addr_signal_max, 0);
- WRITE_ONCE(pernet->add_addr_accept_max, 0);
- WRITE_ONCE(pernet->local_addr_max, 0);
- pernet->addrs = 0;
-}
-
-static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
-{
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- LIST_HEAD(free_list);
-
- spin_lock_bh(&pernet->lock);
- list_splice_init(&pernet->local_addr_list, &free_list);
- __reset_counters(pernet);
- pernet->next_id = 1;
- bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
- spin_unlock_bh(&pernet->lock);
- mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
- synchronize_rcu();
- __flush_addrs(&free_list);
- return 0;
-}
-
static int mptcp_nl_fill_addr(struct sk_buff *skb,
struct mptcp_pm_addr_entry *entry)
{
@@ -1661,15 +166,26 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
+static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
+ struct genl_info *info)
+{
+ if (info->attrs[MPTCP_PM_ATTR_TOKEN])
+ return mptcp_userspace_pm_get_addr(id, addr, info);
+ return mptcp_pm_nl_get_addr(id, addr, info);
+}
+
+int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- struct mptcp_pm_addr_entry addr, *entry;
+ struct mptcp_pm_addr_entry addr;
+ struct nlattr *attr;
struct sk_buff *msg;
void *reply;
int ret;
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
+ return -EINVAL;
+
+ attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
@@ -1686,245 +202,83 @@ static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
- spin_lock_bh(&pernet->lock);
- entry = __lookup_addr_by_id(pernet, addr.addr.id);
- if (!entry) {
- GENL_SET_ERR_MSG(info, "address not found");
- ret = -EINVAL;
- goto unlock_fail;
+ ret = mptcp_pm_get_addr(addr.addr.id, &addr, info);
+ if (ret) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found");
+ goto fail;
}
- ret = mptcp_nl_fill_addr(msg, entry);
+ ret = mptcp_nl_fill_addr(msg, &addr);
if (ret)
- goto unlock_fail;
+ goto fail;
genlmsg_end(msg, reply);
ret = genlmsg_reply(msg, info);
- spin_unlock_bh(&pernet->lock);
return ret;
-unlock_fail:
- spin_unlock_bh(&pernet->lock);
-
fail:
nlmsg_free(msg);
return ret;
}
-static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
- struct netlink_callback *cb)
+int mptcp_pm_genl_fill_addr(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct mptcp_pm_addr_entry *entry)
{
- struct net *net = sock_net(msg->sk);
- struct mptcp_pm_addr_entry *entry;
- struct pm_nl_pernet *pernet;
- int id = cb->args[0];
void *hdr;
- int i;
-
- pernet = pm_nl_get_pernet(net);
-
- spin_lock_bh(&pernet->lock);
- for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
- if (test_bit(i, pernet->id_bitmap)) {
- entry = __lookup_addr_by_id(pernet, i);
- if (!entry)
- break;
-
- if (entry->addr.id <= id)
- continue;
-
- hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, &mptcp_genl_family,
- NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
- if (!hdr)
- break;
-
- if (mptcp_nl_fill_addr(msg, entry) < 0) {
- genlmsg_cancel(msg, hdr);
- break;
- }
-
- id = entry->addr.id;
- genlmsg_end(msg, hdr);
- }
- }
- spin_unlock_bh(&pernet->lock);
-
- cb->args[0] = id;
- return msg->len;
-}
-static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
-{
- struct nlattr *attr = info->attrs[id];
-
- if (!attr)
- return 0;
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ return -EINVAL;
- *limit = nla_get_u32(attr);
- if (*limit > MPTCP_PM_ADDR_MAX) {
- GENL_SET_ERR_MSG(info, "limit greater than maximum");
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
return -EINVAL;
}
- return 0;
-}
-static int
-mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info)
-{
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- unsigned int rcv_addrs, subflows;
- int ret;
-
- spin_lock_bh(&pernet->lock);
- rcv_addrs = pernet->add_addr_accept_max;
- ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
- if (ret)
- goto unlock;
-
- subflows = pernet->subflows_max;
- ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
- if (ret)
- goto unlock;
-
- WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
- WRITE_ONCE(pernet->subflows_max, subflows);
-
-unlock:
- spin_unlock_bh(&pernet->lock);
- return ret;
+ genlmsg_end(msg, hdr);
+ return 0;
}
-static int
-mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info)
+static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- struct sk_buff *msg;
- void *reply;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
- MPTCP_PM_CMD_GET_LIMITS);
- if (!reply)
- goto fail;
+ const struct genl_info *info = genl_info_dump(cb);
- if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
- READ_ONCE(pernet->add_addr_accept_max)))
- goto fail;
-
- if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
- READ_ONCE(pernet->subflows_max)))
- goto fail;
-
- genlmsg_end(msg, reply);
- return genlmsg_reply(msg, info);
-
-fail:
- GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
- nlmsg_free(msg);
- return -EMSGSIZE;
+ if (info->attrs[MPTCP_PM_ATTR_TOKEN])
+ return mptcp_userspace_pm_dump_addr(msg, cb);
+ return mptcp_pm_nl_dump_addr(msg, cb);
}
-static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
{
- struct mptcp_rm_list list = { .nr = 0 };
-
- list.ids[list.nr++] = addr->id;
-
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_nl_rm_subflow_received(msk, &list);
- mptcp_pm_create_subflow_or_signal_addr(msk);
- spin_unlock_bh(&msk->pm.lock);
+ return mptcp_pm_dump_addr(msg, cb);
}
-static int mptcp_nl_set_flags(struct net *net,
- struct mptcp_addr_info *addr,
- u8 bkup, u8 changed)
+static int mptcp_pm_set_flags(struct genl_info *info)
{
- long s_slot = 0, s_num = 0;
- struct mptcp_sock *msk;
+ struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, };
+ struct nlattr *attr_loc;
int ret = -EINVAL;
- while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
- struct sock *sk = (struct sock *)msk;
-
- if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
- goto next;
-
- lock_sock(sk);
- if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
- ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup);
- if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
- mptcp_pm_nl_fullmesh(msk, addr);
- release_sock(sk);
-
-next:
- sock_put(sk);
- cond_resched();
- }
-
- return ret;
-}
-
-static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
-{
- struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry;
- struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, };
- struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
- MPTCP_PM_ADDR_FLAG_FULLMESH;
- struct net *net = sock_net(skb->sk);
- u8 bkup = 0, lookup_by_id = 0;
- int ret;
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR))
+ return ret;
- ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR];
+ ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc);
if (ret < 0)
return ret;
- if (attr_rem) {
- ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote);
- if (ret < 0)
- return ret;
- }
-
- if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
- bkup = 1;
- if (addr.addr.family == AF_UNSPEC) {
- lookup_by_id = 1;
- if (!addr.addr.id)
- return -EOPNOTSUPP;
- }
-
- if (token)
- return mptcp_userspace_pm_set_flags(sock_net(skb->sk),
- token, &addr, &remote, bkup);
-
- spin_lock_bh(&pernet->lock);
- entry = __lookup_addr(pernet, &addr.addr, lookup_by_id);
- if (!entry) {
- spin_unlock_bh(&pernet->lock);
- return -EINVAL;
- }
- if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
- (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
- spin_unlock_bh(&pernet->lock);
- return -EINVAL;
- }
-
- changed = (addr.flags ^ entry->flags) & mask;
- entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
- addr = *entry;
- spin_unlock_bh(&pernet->lock);
+ if (info->attrs[MPTCP_PM_ATTR_TOKEN])
+ return mptcp_userspace_pm_set_flags(&loc, info);
+ return mptcp_pm_nl_set_flags(&loc, info);
+}
- mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
- return 0;
+int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ return mptcp_pm_set_flags(info);
}
static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp)
@@ -1957,9 +311,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk)
break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
case AF_INET6: {
- const struct ipv6_pinfo *np = inet6_sk(ssk);
-
- if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr))
return -EMSGSIZE;
if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr))
return -EMSGSIZE;
@@ -1980,7 +332,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk)
if (WARN_ON_ONCE(!sf))
return -EINVAL;
- if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id))
+ if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf)))
return -EMSGSIZE;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id))
@@ -1997,7 +349,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb,
const struct mptcp_subflow_context *sf;
u8 sk_err;
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
return -EMSGSIZE;
if (mptcp_event_add_subflow(skb, ssk))
@@ -2014,7 +366,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb,
nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if))
return -EMSGSIZE;
- sk_err = ssk->sk_err;
+ sk_err = READ_ONCE(ssk->sk_err);
if (sk_err && sk->sk_state == TCP_ESTABLISHED &&
nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err))
return -EMSGSIZE;
@@ -2055,12 +407,24 @@ static int mptcp_event_created(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
- int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token);
+ int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token));
+ u16 flags = 0;
if (err)
return err;
- if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side)))
+ if (READ_ONCE(msk->pm.server_side)) {
+ flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE;
+
+ /* Deprecated, and only set when it is the server side */
+ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1))
+ return -EMSGSIZE;
+ }
+
+ if (READ_ONCE(msk->pm.remote_deny_join_id0))
+ flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0;
+
+ if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags))
return -EMSGSIZE;
return mptcp_event_add_subflow(skb, ssk);
@@ -2083,7 +447,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id)
if (!nlh)
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id))
@@ -2118,7 +482,7 @@ void mptcp_event_addr_announced(const struct sock *ssk,
if (!nlh)
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id))
@@ -2186,9 +550,7 @@ void mptcp_event_pm_listener(const struct sock *ssk,
break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
case AF_INET6: {
- const struct ipv6_pinfo *np = inet6_sk(ssk);
-
- if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr))
goto nla_put_failure;
break;
}
@@ -2234,7 +596,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
goto nla_put_failure;
break;
case MPTCP_EVENT_CLOSED:
- if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0)
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0)
goto nla_put_failure;
break;
case MPTCP_EVENT_ANNOUNCED:
@@ -2264,123 +626,20 @@ nla_put_failure:
nlmsg_free(skb);
}
-static const struct genl_small_ops mptcp_pm_ops[] = {
- {
- .cmd = MPTCP_PM_CMD_ADD_ADDR,
- .doit = mptcp_nl_cmd_add_addr,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_DEL_ADDR,
- .doit = mptcp_nl_cmd_del_addr,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
- .doit = mptcp_nl_cmd_flush_addrs,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_GET_ADDR,
- .doit = mptcp_nl_cmd_get_addr,
- .dumpit = mptcp_nl_cmd_dump_addrs,
- },
- {
- .cmd = MPTCP_PM_CMD_SET_LIMITS,
- .doit = mptcp_nl_cmd_set_limits,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_GET_LIMITS,
- .doit = mptcp_nl_cmd_get_limits,
- },
- {
- .cmd = MPTCP_PM_CMD_SET_FLAGS,
- .doit = mptcp_nl_cmd_set_flags,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_ANNOUNCE,
- .doit = mptcp_nl_cmd_announce,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_REMOVE,
- .doit = mptcp_nl_cmd_remove,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE,
- .doit = mptcp_nl_cmd_sf_create,
- .flags = GENL_UNS_ADMIN_PERM,
- },
- {
- .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY,
- .doit = mptcp_nl_cmd_sf_destroy,
- .flags = GENL_UNS_ADMIN_PERM,
- },
-};
-
-static struct genl_family mptcp_genl_family __ro_after_init = {
+struct genl_family mptcp_genl_family __ro_after_init = {
.name = MPTCP_PM_NAME,
.version = MPTCP_PM_VER,
- .maxattr = MPTCP_PM_ATTR_MAX,
- .policy = mptcp_pm_policy,
.netnsok = true,
.module = THIS_MODULE,
- .small_ops = mptcp_pm_ops,
- .n_small_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .ops = mptcp_pm_nl_ops,
+ .n_ops = ARRAY_SIZE(mptcp_pm_nl_ops),
.resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
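
The family-wide policy, small_ops table and maxattr are dropped because the operations are now generated from the MPTCP netlink spec (mptcp_pm_nl_ops lives in the new mptcp_pm_gen.c, not shown in this diff). As a rough, hypothetical illustration only — the per-op policy name and field values below are assumptions, not the generated code — one such entry pairs a doit/dumpit handler with its own policy and maxattr:

/* Hypothetical sketch of a generated entry; the real table and policies
 * are emitted into mptcp_pm_gen.c/.h by the netlink spec tooling.
 */
static const struct genl_ops example_get_addr_op = {
	.cmd		= MPTCP_PM_CMD_GET_ADDR,
	.validate	= GENL_DONT_VALIDATE_STRICT,
	.doit		= mptcp_pm_nl_get_addr_doit,
	.dumpit		= mptcp_pm_nl_get_addr_dumpit,
	.policy		= mptcp_pm_get_addr_nl_policy,	/* assumed name */
	.maxattr	= MPTCP_PM_ATTR_MAX,		/* assumed */
};
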
-static int __net_init pm_nl_init_net(struct net *net)
-{
- struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
-
- INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
-
- /* Cit. 2 subflows ought to be enough for anybody. */
- pernet->subflows_max = 2;
- pernet->next_id = 1;
- pernet->stale_loss_cnt = 4;
- spin_lock_init(&pernet->lock);
-
- /* No need to initialize other pernet fields, the struct is zeroed at
- * allocation time.
- */
-
- return 0;
-}
-
-static void __net_exit pm_nl_exit_net(struct list_head *net_list)
-{
- struct net *net;
-
- list_for_each_entry(net, net_list, exit_list) {
- struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
-
- /* net is removed from namespace list, can't race with
- * other modifiers, also netns core already waited for a
- * RCU grace period.
- */
- __flush_addrs(&pernet->local_addr_list);
- }
-}
-
-static struct pernet_operations mptcp_pm_pernet_ops = {
- .init = pm_nl_init_net,
- .exit_batch = pm_nl_exit_net,
- .id = &pm_nl_pernet_id,
- .size = sizeof(struct pm_nl_pernet),
-};
-
void __init mptcp_pm_nl_init(void)
{
- if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
- panic("Failed to register MPTCP PM pernet subsystem.\n");
-
if (genl_register_family(&mptcp_genl_family))
panic("Failed to register MPTCP PM netlink family\n");
}
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index ea6ad9da7493..8cbc1920afb4 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -6,16 +6,18 @@
#include "protocol.h"
#include "mib.h"
+#include "mptcp_pm_gen.h"
-void mptcp_free_local_addr_list(struct mptcp_sock *msk)
+#define mptcp_for_each_userspace_pm_addr(__msk, __entry) \
+ list_for_each_entry(__entry, \
+ &((__msk)->pm.userspace_pm_local_addr_list), list)
+
+void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *tmp;
struct sock *sk = (struct sock *)msk;
LIST_HEAD(free_list);
- if (!mptcp_pm_is_userspace(msk))
- return;
-
spin_lock_bh(&msk->pm.lock);
list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list);
spin_unlock_bh(&msk->pm.lock);
@@ -25,11 +27,24 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk)
}
}
-int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry)
+static struct mptcp_pm_addr_entry *
+mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (mptcp_addresses_equal(&entry->addr, addr, false))
+ return entry;
+ }
+ return NULL;
+}
+
+static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry,
+ bool needs_id)
{
DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
- struct mptcp_pm_addr_entry *match = NULL;
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *e;
bool addr_match = false;
@@ -39,276 +54,376 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
+ mptcp_for_each_userspace_pm_addr(msk, e) {
addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true);
- if (addr_match && entry->addr.id == 0)
+ if (addr_match && entry->addr.id == 0 && needs_id)
entry->addr.id = e->addr.id;
id_match = (e->addr.id == entry->addr.id);
- if (addr_match && id_match) {
- match = e;
+ if (addr_match || id_match)
break;
- } else if (addr_match || id_match) {
- break;
- }
__set_bit(e->addr.id, id_bitmap);
}
- if (!match && !addr_match && !id_match) {
+ if (!addr_match && !id_match) {
/* Memory for the entry is allocated from the
* sock option buffer.
*/
- e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC);
+ e = sock_kmemdup(sk, entry, sizeof(*entry), GFP_ATOMIC);
if (!e) {
- spin_unlock_bh(&msk->pm.lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto append_err;
}
- *e = *entry;
- if (!e->addr.id)
+ if (!e->addr.id && needs_id)
e->addr.id = find_next_zero_bit(id_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1,
1);
list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list);
+ msk->pm.local_addr_used++;
ret = e->addr.id;
- } else if (match) {
+ } else if (addr_match && id_match) {
ret = entry->addr.id;
}
+append_err:
spin_unlock_bh(&msk->pm.lock);
return ret;
}
-int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
- unsigned int id,
- u8 *flags, int *ifindex)
+/* If the subflow is closed from the other peer (i.e. not via a
+ * subflow destroy command), keep the entry so the same ID is not
+ * assigned to another address and an RM_ADDR can still be sent
+ * after the subflow removal.
+ */
+static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *addr)
{
- struct mptcp_pm_addr_entry *entry, *match = NULL;
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
- *flags = 0;
- *ifindex = 0;
-
- spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
- if (id == entry->addr.id) {
- match = entry;
- break;
- }
- }
- spin_unlock_bh(&msk->pm.lock);
- if (match) {
- *flags = match->flags;
- *ifindex = match->ifindex;
- }
+ entry = mptcp_userspace_pm_lookup_addr(msk, &addr->addr);
+ if (!entry)
+ return -EINVAL;
+ /* TODO: a refcount is needed because the entry can
+ * be used multiple times (e.g. fullmesh mode).
+ */
+ list_del_rcu(&entry->list);
+ sock_kfree_s(sk, entry, sizeof(*entry));
+ msk->pm.local_addr_used--;
return 0;
}
+static struct mptcp_pm_addr_entry *
+mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (entry->addr.id == id)
+ return entry;
+ }
+ return NULL;
+}
+
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
- struct mptcp_addr_info *skc)
+ struct mptcp_pm_addr_entry *skc)
{
- struct mptcp_pm_addr_entry new_entry;
__be16 msk_sport = ((struct inet_sock *)
inet_sk((struct sock *)msk))->inet_sport;
+ struct mptcp_pm_addr_entry *entry;
- memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry));
- new_entry.addr = *skc;
- new_entry.addr.id = 0;
- new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr(msk, &skc->addr);
+ spin_unlock_bh(&msk->pm.lock);
+ if (entry)
+ return entry->addr.id;
- if (new_entry.addr.port == msk_sport)
- new_entry.addr.port = 0;
+ if (skc->addr.port == msk_sport)
+ skc->addr.port = 0;
- return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry);
+ return mptcp_userspace_pm_append_new_local_addr(msk, skc, true);
}
-int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info)
+bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk,
+ struct mptcp_addr_info *skc)
{
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct mptcp_pm_addr_entry addr_val;
- struct mptcp_sock *msk;
- int err = -EINVAL;
- u32 token_val;
+ struct mptcp_pm_addr_entry *entry;
+ bool backup;
- if (!addr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
- return err;
- }
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr(msk, skc);
+ backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ spin_unlock_bh(&msk->pm.lock);
+
+ return backup;
+}
- token_val = nla_get_u32(token);
+static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info)
+{
+ struct mptcp_sock *msk;
+ struct nlattr *token;
+
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_TOKEN))
+ return NULL;
- msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
+ token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token));
if (!msk) {
NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
- return err;
+ return NULL;
}
if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto announce_err;
+ NL_SET_ERR_MSG_ATTR(info->extack, token,
+ "userspace PM not selected");
+ sock_put((struct sock *)msk);
+ return NULL;
}
+ return msk;
+}
+
+int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct mptcp_pm_addr_entry addr_val;
+ struct mptcp_sock *msk;
+ struct nlattr *addr;
+ int err = -EINVAL;
+ struct sock *sk;
+
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR))
+ return err;
+
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
+ return err;
+
+ sk = (struct sock *)msk;
+
+ addr = info->attrs[MPTCP_PM_ATTR_ADDR];
err = mptcp_pm_parse_entry(addr, info, true, &addr_val);
- if (err < 0) {
- GENL_SET_ERR_MSG(info, "error parsing local address");
+ if (err < 0)
+ goto announce_err;
+
+ if (addr_val.addr.id == 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr id");
+ err = -EINVAL;
goto announce_err;
}
- if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
- GENL_SET_ERR_MSG(info, "invalid addr id or flags");
+ if (!(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr flags");
err = -EINVAL;
goto announce_err;
}
- err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val);
+ err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val, false);
if (err < 0) {
- GENL_SET_ERR_MSG(info, "did not match address and id");
+ NL_SET_ERR_MSG_ATTR(info->extack, addr,
+ "did not match address and id");
goto announce_err;
}
- lock_sock((struct sock *)msk);
+ lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
- if (mptcp_pm_alloc_anno_list(msk, &addr_val)) {
+ if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) {
+ msk->pm.add_addr_signaled++;
mptcp_pm_announce_addr(msk, &addr_val.addr, false);
- mptcp_pm_nl_addr_send_ack(msk);
+ mptcp_pm_addr_send_ack(msk);
}
spin_unlock_bh(&msk->pm.lock);
- release_sock((struct sock *)msk);
+ release_sock(sk);
err = 0;
announce_err:
- sock_put((struct sock *)msk);
+ sock_put(sk);
return err;
}
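
For context, this handler is driven by a userspace path manager over the "mptcp_pm" generic netlink family. Below is a minimal, hypothetical libmnl client showing only the attribute layout the doit above expects (a token plus a nested address carrying a non-zero id and the SIGNAL flag); it assumes the genetlink family id has already been resolved into fam_id, and it is not the selftests' pm_nl_ctl tool:

#include <stdint.h>
#include <arpa/inet.h>
#include <linux/genetlink.h>
#include <linux/mptcp.h>		/* MPTCP_PM_* uAPI definitions */
#include <libmnl/libmnl.h>

/* Announce 192.0.2.1 (endpoint id 1, SIGNAL) for the connection owning
 * 'token'; 'fam_id' is the resolved "mptcp_pm" genetlink family id.
 */
static int announce_addr(struct mnl_socket *nl, uint16_t fam_id, uint32_t token)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct genlmsghdr *gnlh;
	struct nlmsghdr *nlh;
	struct nlattr *nest;
	struct in_addr a;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = fam_id;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	gnlh = mnl_nlmsg_put_extra_header(nlh, sizeof(*gnlh));
	gnlh->cmd = MPTCP_PM_CMD_ANNOUNCE;
	gnlh->version = MPTCP_PM_VER;

	mnl_attr_put_u32(nlh, MPTCP_PM_ATTR_TOKEN, token);

	nest = mnl_attr_nest_start(nlh, MPTCP_PM_ATTR_ADDR);
	mnl_attr_put_u16(nlh, MPTCP_PM_ADDR_ATTR_FAMILY, AF_INET);
	mnl_attr_put_u8(nlh, MPTCP_PM_ADDR_ATTR_ID, 1);
	inet_pton(AF_INET, "192.0.2.1", &a);
	mnl_attr_put_u32(nlh, MPTCP_PM_ADDR_ATTR_ADDR4, a.s_addr);
	mnl_attr_put_u32(nlh, MPTCP_PM_ADDR_ATTR_FLAGS, MPTCP_PM_ADDR_FLAG_SIGNAL);
	mnl_attr_nest_end(nlh, nest);

	return mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0 ? -1 : 0;
}
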
-int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info)
+static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk)
{
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID];
- struct mptcp_pm_addr_entry *match = NULL;
- struct mptcp_pm_addr_entry *entry;
+ struct mptcp_rm_list list = { .nr = 0 };
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ bool has_id_0 = false;
+ int err = -EINVAL;
+
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->local_id) == 0) {
+ has_id_0 = true;
+ break;
+ }
+ }
+ if (!has_id_0)
+ goto remove_err;
+
+ list.ids[list.nr++] = 0;
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ err = 0;
+
+remove_err:
+ release_sock(sk);
+ return err;
+}
+
+void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_rm_list alist = { .nr = 0 };
+ int anno_nr = 0;
+
+ /* only delete if either announced or matching a subflow */
+ if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
+ anno_nr++;
+ else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
+ return;
+
+ alist.ids[alist.nr++] = entry->addr.id;
+
+ spin_lock_bh(&msk->pm.lock);
+ msk->pm.add_addr_signaled -= anno_nr;
+ mptcp_pm_remove_addr(msk, &alist);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct mptcp_pm_addr_entry *match;
struct mptcp_sock *msk;
- LIST_HEAD(free_list);
+ struct nlattr *id;
int err = -EINVAL;
- u32 token_val;
+ struct sock *sk;
u8 id_val;
- if (!id || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_LOC_ID))
return err;
- }
+ id = info->attrs[MPTCP_PM_ATTR_LOC_ID];
id_val = nla_get_u8(id);
- token_val = nla_get_u32(token);
- msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
-
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto remove_err;
- }
- lock_sock((struct sock *)msk);
+ sk = (struct sock *)msk;
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
- if (entry->addr.id == id_val) {
- match = entry;
- break;
- }
+ if (id_val == 0) {
+ err = mptcp_userspace_pm_remove_id_zero_address(msk);
+ goto out;
}
+ lock_sock(sk);
+
+ spin_lock_bh(&msk->pm.lock);
+ match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val);
if (!match) {
- GENL_SET_ERR_MSG(info, "address with specified id not found");
- release_sock((struct sock *)msk);
- goto remove_err;
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+ goto out;
}
- list_move(&match->list, &free_list);
+ list_del_rcu(&match->list);
+ spin_unlock_bh(&msk->pm.lock);
- mptcp_pm_remove_addrs_and_subflows(msk, &free_list);
+ mptcp_pm_remove_addr_entry(msk, match);
- release_sock((struct sock *)msk);
+ release_sock(sk);
- list_for_each_entry_safe(match, entry, &free_list, list) {
- sock_kfree_s((struct sock *)msk, match, sizeof(*match));
- }
+ kfree_rcu_mightsleep(match);
+ /* Adjust sk_omem_alloc like sock_kfree_s() does, to match
+ * with allocation of this memory by sock_kmemdup()
+ */
+ atomic_sub(sizeof(*match), &sk->sk_omem_alloc);
err = 0;
- remove_err:
- sock_put((struct sock *)msk);
+out:
+ if (err)
+ NL_SET_ERR_MSG_ATTR_FMT(info->extack, id,
+ "address with id %u not found",
+ id_val);
+
+ sock_put(sk);
return err;
}
-int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info)
+int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_pm_addr_entry entry = { 0 };
struct mptcp_addr_info addr_r;
- struct mptcp_addr_info addr_l;
+ struct nlattr *raddr, *laddr;
+ struct mptcp_pm_local local;
struct mptcp_sock *msk;
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
- if (!laddr || !raddr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) ||
+ GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE))
return err;
- }
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(genl_info_net(info), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto create_err;
- }
+ sk = (struct sock *)msk;
- err = mptcp_pm_parse_addr(laddr, info, &addr_l);
- if (err < 0) {
- NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ err = mptcp_pm_parse_entry(laddr, info, true, &entry);
+ if (err < 0)
goto create_err;
- }
- if (addr_l.id == 0) {
- NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id");
+ if (entry.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "invalid addr flags");
err = -EINVAL;
goto create_err;
}
+ entry.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
+ raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
err = mptcp_pm_parse_addr(raddr, info, &addr_r);
- if (err < 0) {
- NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr");
+ if (err < 0)
goto create_err;
- }
-
- sk = (struct sock *)msk;
- if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) {
+ if (!mptcp_pm_addr_families_match(sk, &entry.addr, &addr_r)) {
GENL_SET_ERR_MSG(info, "families mismatch");
err = -EINVAL;
goto create_err;
}
- lock_sock(sk);
+ err = mptcp_userspace_pm_append_new_local_addr(msk, &entry, false);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr,
+ "did not match address and id");
+ goto create_err;
+ }
- err = __mptcp_subflow_connect(sk, &addr_l, &addr_r);
+ local.addr = entry.addr;
+ local.flags = entry.flags;
+ local.ifindex = entry.ifindex;
+ lock_sock(sk);
+ err = __mptcp_subflow_connect(sk, &local, &addr_r);
release_sock(sk);
+ if (err)
+ GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err);
+
+ spin_lock_bh(&msk->pm.lock);
+ if (err)
+ mptcp_userspace_pm_delete_local_addr(msk, &entry);
+ else
+ msk->pm.extra_subflows++;
+ spin_unlock_bh(&msk->pm.lock);
+
create_err:
- sock_put((struct sock *)msk);
+ sock_put(sk);
return err;
}
@@ -340,9 +455,7 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
case AF_INET6: {
- const struct ipv6_pinfo *pinfo = inet6_sk(ssk);
-
- if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) ||
+ if (!ipv6_addr_equal(&local->addr6, &issk->pinet6->saddr) ||
!ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr))
continue;
break;
@@ -360,106 +473,226 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
return NULL;
}
-int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info)
+int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct mptcp_addr_info addr_l;
+ struct mptcp_pm_addr_entry addr_l;
struct mptcp_addr_info addr_r;
+ struct nlattr *raddr, *laddr;
struct mptcp_sock *msk;
struct sock *sk, *ssk;
int err = -EINVAL;
- u32 token_val;
- if (!laddr || !raddr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) ||
+ GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE))
return err;
- }
-
- token_val = nla_get_u32(token);
- msk = mptcp_token_get_sock(genl_info_net(info), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto destroy_err;
- }
+ sk = (struct sock *)msk;
- err = mptcp_pm_parse_addr(laddr, info, &addr_l);
- if (err < 0) {
- NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ err = mptcp_pm_parse_entry(laddr, info, true, &addr_l);
+ if (err < 0)
goto destroy_err;
- }
+ raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
err = mptcp_pm_parse_addr(raddr, info, &addr_r);
- if (err < 0) {
- NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr");
+ if (err < 0)
goto destroy_err;
- }
- if (addr_l.family != addr_r.family) {
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) {
+ ipv6_addr_set_v4mapped(addr_l.addr.addr.s_addr, &addr_l.addr.addr6);
+ addr_l.addr.family = AF_INET6;
+ }
+ if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr.addr6)) {
+ ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6);
+ addr_r.family = AF_INET6;
+ }
+#endif
+ if (addr_l.addr.family != addr_r.family) {
GENL_SET_ERR_MSG(info, "address families do not match");
err = -EINVAL;
goto destroy_err;
}
- if (!addr_l.port || !addr_r.port) {
- GENL_SET_ERR_MSG(info, "missing local or remote port");
+ if (!addr_l.addr.port) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local port");
+ err = -EINVAL;
+ goto destroy_err;
+ }
+
+ if (!addr_r.port) {
+ NL_SET_ERR_MSG_ATTR(info->extack, raddr, "missing remote port");
err = -EINVAL;
goto destroy_err;
}
- sk = (struct sock *)msk;
lock_sock(sk);
- ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
- if (ssk) {
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
-
- mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
- mptcp_close_ssk(sk, ssk, subflow);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
- err = 0;
- } else {
+ ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r);
+ if (!ssk) {
+ GENL_SET_ERR_MSG(info, "subflow not found");
err = -ESRCH;
+ goto release_sock;
}
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_userspace_pm_delete_local_addr(msk, &addr_l);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ mptcp_close_ssk(sk, ssk, mptcp_subflow_ctx(ssk));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+release_sock:
release_sock(sk);
destroy_err:
- sock_put((struct sock *)msk);
+ sock_put(sk);
return err;
}
-int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
- struct mptcp_pm_addr_entry *loc,
- struct mptcp_pm_addr_entry *rem, u8 bkup)
+int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local,
+ struct genl_info *info)
{
+ struct mptcp_addr_info rem = { .family = AF_UNSPEC, };
+ struct mptcp_pm_addr_entry *entry;
+ struct nlattr *attr, *attr_rem;
struct mptcp_sock *msk;
int ret = -EINVAL;
- u32 token_val;
+ struct sock *sk;
+ u8 bkup = 0;
- token_val = nla_get_u32(token);
+ if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE))
+ return ret;
- msk = mptcp_token_get_sock(net, token_val);
+ msk = mptcp_userspace_pm_get_sock(info);
if (!msk)
return ret;
- if (!mptcp_pm_is_userspace(msk))
+ sk = (struct sock *)msk;
+
+ attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ if (local->addr.family == AF_UNSPEC) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "invalid local address family");
+ ret = -EINVAL;
+ goto set_flags_err;
+ }
+
+ attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ ret = mptcp_pm_parse_addr(attr_rem, info, &rem);
+ if (ret < 0)
goto set_flags_err;
- if (loc->addr.family == AF_UNSPEC ||
- rem->addr.family == AF_UNSPEC)
+ if (rem.family == AF_UNSPEC) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr_rem,
+ "invalid remote address family");
+ ret = -EINVAL;
goto set_flags_err;
+ }
+
+ if (local->flags & MPTCP_PM_ADDR_FLAG_BACKUP)
+ bkup = 1;
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr(msk, &local->addr);
+ if (entry) {
+ if (bkup)
+ entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ }
+ spin_unlock_bh(&msk->pm.lock);
- lock_sock((struct sock *)msk);
- ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup);
- release_sock((struct sock *)msk);
+ lock_sock(sk);
+ ret = mptcp_pm_mp_prio_send_ack(msk, &local->addr, &rem, bkup);
+ release_sock(sk);
+
+ /* mptcp_pm_mp_prio_send_ack() only fails in one case */
+ if (ret < 0)
+ GENL_SET_ERR_MSG(info, "subflow not found");
set_flags_err:
- sock_put((struct sock *)msk);
+ sock_put(sk);
return ret;
}
+
+int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct id_bitmap {
+ DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1);
+ } *bitmap;
+ const struct genl_info *info = genl_info_dump(cb);
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+ struct sock *sk;
+
+ BUILD_BUG_ON(sizeof(struct id_bitmap) > sizeof(cb->ctx));
+
+ bitmap = (struct id_bitmap *)cb->ctx;
+
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
+ return ret;
+
+ sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (test_bit(entry->addr.id, bitmap->map))
+ continue;
+
+ if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0)
+ break;
+
+ __set_bit(entry->addr.id, bitmap->map);
+ }
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+ ret = msg->len;
+
+ sock_put(sk);
+ return ret;
+}
+
+int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
+ struct genl_info *info)
+{
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+ struct sock *sk;
+
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
+ return ret;
+
+ sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_userspace_pm_lookup_addr_by_id(msk, id);
+ if (entry) {
+ *addr = *entry;
+ ret = 0;
+ }
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+ sock_put(sk);
+ return ret;
+}
+
+static struct mptcp_pm_ops mptcp_pm_userspace = {
+ .name = "userspace",
+ .owner = THIS_MODULE,
+};
+
+void __init mptcp_pm_userspace_register(void)
+{
+ mptcp_pm_register(&mptcp_pm_userspace);
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8cd6cc67c2c5..e212c1374bd0 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,16 +11,18 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
+#include <net/aligned_data.h>
+#include <net/rps.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
+#include <net/hotdata.h>
#include <net/xfrm.h>
#include <asm/ioctls.h>
#include "protocol.h"
@@ -44,22 +46,12 @@ enum {
static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
static void __mptcp_destroy_sock(struct sock *sk);
-static void __mptcp_check_send_data_fin(struct sock *sk);
+static void mptcp_check_send_data_fin(struct sock *sk);
-DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
-static struct net_device mptcp_napi_dev;
-
-/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
- * completed yet or has failed, return the subflow socket.
- * Otherwise return NULL.
- */
-struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
-{
- if (!msk->subflow || READ_ONCE(msk->can_ack))
- return NULL;
-
- return msk->subflow;
-}
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+static struct net_device *mptcp_napi_dev;
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
@@ -67,28 +59,43 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
return READ_ONCE(msk->wnd_end);
}
-static bool mptcp_is_tcpsk(struct sock *sk)
+static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
{
- struct socket *sock = sk->sk_socket;
+ unsigned short family = READ_ONCE(sk->sk_family);
- if (unlikely(sk->sk_prot == &tcp_prot)) {
- /* we are being invoked after mptcp_accept() has
- * accepted a non-mp-capable flow: sk is a tcp_sk,
- * not an mptcp one.
- *
- * Hand the socket over to tcp so all further socket ops
- * bypass mptcp.
- */
- sock->ops = &inet_stream_ops;
- return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
- sock->ops = &inet6_stream_ops;
- return true;
+ if (family == AF_INET6)
+ return &inet6_stream_ops;
#endif
+ WARN_ON_ONCE(family != AF_INET);
+ return &inet_stream_ops;
+}
+
+bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib)
+{
+ struct net *net = sock_net((struct sock *)msk);
+
+ if (__mptcp_check_fallback(msk))
+ return true;
+
+ /* The caller possibly is not holding the msk socket lock, but
+ * in the fallback case only the current subflow is touching
+ * the OoO queue.
+ */
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+ return false;
+
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_infinite_fallback) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
}
- return false;
+ msk->allow_subflows = false;
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+ __MPTCP_INC_STATS(net, fb_mib);
+ spin_unlock_bh(&msk->fallback_lock);
+ return true;
}
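
A hypothetical caller of this helper (the MIB counter name below is a placeholder, not one defined by this patch) simply bails out when the fallback is refused and otherwise treats the connection as a single TCP flow from that point on:

static bool example_try_fallback(struct mptcp_sock *msk)
{
	/* Placeholder MIB id: real callers pass the counter matching the
	 * reason for the fallback.
	 */
	if (!__mptcp_try_fallback(msk, MPTCP_MIB_EXAMPLE_FALLBACK))
		return false;	/* keep the connection fully MPTCP */

	/* from here on no new subflows are created or accepted and data
	 * flows as plain TCP on the current subflow
	 */
	return true;
}
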
static int __mptcp_socket_create(struct mptcp_sock *msk)
@@ -102,29 +109,65 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
if (err)
return err;
- msk->first = ssock->sk;
- msk->subflow = ssock;
+ msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
+ WRITE_ONCE(msk->first, ssock->sk);
subflow = mptcp_subflow_ctx(ssock->sk);
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
subflow->request_mptcp = 1;
+ subflow->subflow_id = msk->subflow_id++;
/* This is the first subflow, always with id 0 */
- subflow->local_id_valid = 1;
+ WRITE_ONCE(subflow->local_id, 0);
mptcp_sock_graft(msk->first, sk->sk_socket);
+ iput(SOCK_INODE(ssock));
return 0;
}
+/* If the MPC handshake is not started, returns the first subflow,
+ * allocating it if needed.
+ */
+struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret;
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ return ERR_PTR(-EINVAL);
+
+ if (!msk->first) {
+ ret = __mptcp_socket_create(msk);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ return msk->first;
+}
+
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
__kfree_skb(skb);
}
-static void mptcp_rmem_charge(struct sock *sk, int size)
+static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from, bool *fragstolen,
+ int *delta)
{
- mptcp_sk(sk)->rmem_fwd_alloc -= size;
+ int limit = READ_ONCE(sk->sk_rcvbuf);
+
+ if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
+ MPTCP_SKB_CB(from)->offset ||
+ ((to->len + from->len) > (limit >> 3)) ||
+ !skb_try_coalesce(to, from, fragstolen, delta))
+ return false;
+
+ pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx\n",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ return true;
}
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
@@ -133,21 +176,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
bool fragstolen;
int delta;
- if (MPTCP_SKB_CB(from)->offset ||
- !skb_try_coalesce(to, from, &fragstolen, &delta))
+ if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
return false;
- pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
- MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
- to->len, MPTCP_SKB_CB(from)->end_seq);
- MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
-
/* note the fwd memory can reach a negative value after accounting
* for the delta, but the later skb free will restore a non
* negative one
*/
atomic_add(delta, &sk->sk_rmem_alloc);
- mptcp_rmem_charge(sk, delta);
+ sk_mem_charge(sk, delta);
kfree_skb_partial(from, fragstolen);
return true;
@@ -162,42 +199,42 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
-static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
-{
- amount >>= PAGE_SHIFT;
- mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT;
- __sk_mem_reduce_allocated(sk, amount);
-}
-
-static void mptcp_rmem_uncharge(struct sock *sk, int size)
+/* "inspired" by tcp_rcvbuf_grow(), main difference:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- int reclaimable;
+ const struct net *net = sock_net(sk);
+ u32 rcvwin, rcvbuf, cap, oldval;
+ u64 grow;
+
+ oldval = msk->rcvq_space.space;
+ msk->rcvq_space.space = newval;
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return false;
- msk->rmem_fwd_alloc += size;
- reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
+ /* DRS is always one RTT late. */
+ rcvwin = newval << 1;
- /* see sk_mem_uncharge() for the rationale behind the following schema */
- if (unlikely(reclaimable >= PAGE_SIZE))
- __mptcp_rmem_reclaim(sk, reclaimable);
-}
+ /* slow start: allow the sender to double its rate. */
+ grow = (u64)rcvwin * (newval - oldval);
+ do_div(grow, oldval);
+ rcvwin += grow << 1;
-static void mptcp_rfree(struct sk_buff *skb)
-{
- unsigned int len = skb->truesize;
- struct sock *sk = skb->sk;
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+ rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
- atomic_sub(len, &sk->sk_rmem_alloc);
- mptcp_rmem_uncharge(sk, len);
-}
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
-void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = mptcp_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
- mptcp_rmem_charge(sk, skb->truesize);
+ rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ return true;
+ }
+ return false;
}
/* "inspired" by tcp_data_queue_ofo(), main differences:
@@ -215,7 +252,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end_seq = MPTCP_SKB_CB(skb)->end_seq;
max_seq = atomic64_read(&msk->rcv_wnd_sent);
- pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue));
if (after64(end_seq, max_seq)) {
/* out of window */
@@ -312,46 +349,17 @@ merge_right:
end:
skb_condense(skb);
- mptcp_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
}
-static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- int amt, amount;
-
- if (size <= msk->rmem_fwd_alloc)
- return true;
-
- size -= msk->rmem_fwd_alloc;
- amt = sk_mem_pages(size);
- amount = amt << PAGE_SHIFT;
- if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV))
- return false;
-
- msk->rmem_fwd_alloc += amount;
- return true;
-}
-
-static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb, unsigned int offset,
- size_t copy_len)
+static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
+ int copy_len)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
- bool has_rxtstamp;
-
- __skb_unlink(skb, &ssk->sk_receive_queue);
-
- skb_ext_reset(skb);
- skb_orphan(skb);
-
- /* try to fetch required memory from subflow */
- if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
- goto drop;
-
- has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+ bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
/* the skb map_seq accounts for the skb offset:
* mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
@@ -361,15 +369,31 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
MPTCP_SKB_CB(skb)->offset = offset;
MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
+ MPTCP_SKB_CB(skb)->cant_coalesce = 0;
+
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_dst_drop(skb);
+}
+
+static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
+{
+ u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *tail;
+
+ mptcp_borrow_fwdmem(sk, skb);
if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
/* in sequence */
+ msk->bytes_received += copy_len;
WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
tail = skb_peek_tail(&sk->sk_receive_queue);
if (tail && mptcp_try_coalesce(sk, tail, skb))
return true;
- mptcp_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
@@ -381,16 +405,13 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
* will retransmit as needed, if needed.
*/
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
-drop:
mptcp_drop(sk, skb);
return false;
}
-static void mptcp_stop_timer(struct sock *sk)
+static void mptcp_stop_rtx_timer(struct sock *sk)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->mptcp_retransmit_timer);
mptcp_sk(sk)->timer_ival = 0;
}
@@ -407,12 +428,26 @@ static void mptcp_close_wake_up(struct sock *sk)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
+static void mptcp_shutdown_subflows(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ unlock_sock_fast(ssk, slow);
+ }
+}
+
+/* called under the msk socket lock */
static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- return !__mptcp_check_fallback(msk) &&
- ((1 << sk->sk_state) &
+ return ((1 << sk->sk_state) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
msk->write_seq == READ_ONCE(msk->snd_una);
}
@@ -427,11 +462,12 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
switch (sk->sk_state) {
case TCP_FIN_WAIT1:
- inet_sk_state_store(sk, TCP_FIN_WAIT2);
+ mptcp_set_state(sk, TCP_FIN_WAIT2);
break;
case TCP_CLOSING:
case TCP_LAST_ACK:
- inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_shutdown_subflows(msk);
+ mptcp_set_state(sk, TCP_CLOSE);
break;
}
@@ -439,16 +475,17 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
}
}
+/* can be called with no lock acquired */
static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (READ_ONCE(msk->rcv_data_fin) &&
- ((1 << sk->sk_state) &
+ ((1 << inet_sk_state_load(sk)) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
- if (msk->ack_seq == rcv_data_fin_seq) {
+ if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) {
if (seq)
*seq = rcv_data_fin_seq;
@@ -459,7 +496,7 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
return false;
}
-static void mptcp_set_datafin_timeout(const struct sock *sk)
+static void mptcp_set_datafin_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 retransmits;
@@ -480,7 +517,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl
const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
- inet_csk(ssk)->icsk_timeout - jiffies : 0;
+ tcp_timeout_expires(ssk) - jiffies : 0;
}
static void mptcp_set_timeout(struct sock *sk)
@@ -522,13 +559,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk)
mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow));
}
-static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
+static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied)
{
bool slow;
slow = lock_sock_fast(ssk);
if (tcp_can_send_ack(ssk))
- tcp_cleanup_rbuf(ssk, 1);
+ tcp_cleanup_rbuf(ssk, copied);
unlock_sock_fast(ssk, slow);
}
@@ -545,7 +582,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
(ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED)));
}
-static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
+static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied)
{
int old_space = READ_ONCE(msk->old_wspace);
struct mptcp_subflow_context *subflow;
@@ -553,25 +590,21 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
int space = __mptcp_space(sk);
bool cleanup, rx_empty;
- cleanup = (space > 0) && (space >= (old_space << 1));
- rx_empty = !__mptcp_rmem(sk);
+ cleanup = (space > 0) && (space >= (old_space << 1)) && copied;
+ rx_empty = !sk_rmem_alloc_get(sk) && copied;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty))
- mptcp_subflow_cleanup_rbuf(ssk);
+ mptcp_subflow_cleanup_rbuf(ssk, copied);
}
}
-static bool mptcp_check_data_fin(struct sock *sk)
+static void mptcp_check_data_fin(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
u64 rcv_data_fin_seq;
- bool ret = false;
-
- if (__mptcp_check_fallback(msk))
- return ret;
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
@@ -590,18 +623,19 @@ static bool mptcp_check_data_fin(struct sock *sk)
WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
WRITE_ONCE(msk->rcv_data_fin, 0);
- sk->sk_shutdown |= RCV_SHUTDOWN;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
switch (sk->sk_state) {
case TCP_ESTABLISHED:
- inet_sk_state_store(sk, TCP_CLOSE_WAIT);
+ mptcp_set_state(sk, TCP_CLOSE_WAIT);
break;
case TCP_FIN_WAIT1:
- inet_sk_state_store(sk, TCP_CLOSING);
+ mptcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
- inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_shutdown_subflows(msk);
+ mptcp_set_state(sk, TCP_CLOSE);
break;
default:
/* Other states not expected */
@@ -609,37 +643,72 @@ static bool mptcp_check_data_fin(struct sock *sk)
break;
}
- ret = true;
- mptcp_send_ack(msk);
+ if (!__mptcp_check_fallback(msk))
+ mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
- return ret;
+}
+
+static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
+{
+ if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSCORRUPTIONFALLBACK)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET);
+ mptcp_subflow_reset(ssk);
+ }
+}
+
+static void __mptcp_add_backlog(struct sock *sk,
+ struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *tail = NULL;
+ struct sock *ssk = skb->sk;
+ bool fragstolen;
+ int delta;
+
+ if (unlikely(sk->sk_state == TCP_CLOSE)) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
+ return;
+ }
+
+ /* Try to coalesce with the last skb in our backlog */
+ if (!list_empty(&msk->backlog_list))
+ tail = list_last_entry(&msk->backlog_list, struct sk_buff, list);
+
+ if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq &&
+ ssk == tail->sk &&
+ __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) {
+ skb->truesize -= delta;
+ kfree_skb_partial(skb, fragstolen);
+ __mptcp_subflow_lend_fwdmem(subflow, delta);
+ goto account;
+ }
+
+ list_add_tail(&skb->list, &msk->backlog_list);
+ mptcp_subflow_lend_fwdmem(subflow, skb);
+ delta = skb->truesize;
+
+account:
+ WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
+
+ /* Possibly not accept()ed yet, keep track of memory not CG
+ * accounted, mptcp_graft_subflows() will handle it.
+ */
+ if (!mem_cgroup_from_sk(ssk))
+ msk->backlog_unaccounted += delta;
}
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
- struct sock *ssk,
- unsigned int *bytes)
+ struct sock *ssk, bool own_msk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
- unsigned int moved = 0;
bool more_data_avail;
struct tcp_sock *tp;
- bool done = false;
- int sk_rbuf;
-
- sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
-
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
-
- if (unlikely(ssk_rbuf > sk_rbuf)) {
- WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
- sk_rbuf = ssk_rbuf;
- }
- }
+ bool ret = false;
- pr_debug("msk=%p ssk=%p", msk, ssk);
+ pr_debug("msk=%p ssk=%p\n", msk, ssk);
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -652,15 +721,8 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
mptcp_subflow_get_map_offset(subflow);
skb = skb_peek(&ssk->sk_receive_queue);
- if (!skb) {
- /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
- * a different CPU can have already processed the pending
- * data, stop here or we can enter an infinite loop
- */
- if (!moved)
- done = true;
+ if (unlikely(!skb))
break;
- }
if (__mptcp_check_fallback(msk)) {
/* Under fallback skbs have no MPTCP extension and TCP could
@@ -673,40 +735,43 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
offset = seq - TCP_SKB_CB(skb)->seq;
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
- if (fin) {
- done = true;
+ if (fin)
seq++;
- }
if (offset < skb->len) {
size_t len = skb->len - offset;
- if (tp->urg_data)
- done = true;
+ mptcp_init_skb(ssk, skb, offset, len);
- if (__mptcp_move_skb(msk, ssk, skb, offset, len))
- moved += len;
+ if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) {
+ mptcp_subflow_lend_fwdmem(subflow, skb);
+ ret |= __mptcp_move_skb(sk, skb);
+ } else {
+ __mptcp_add_backlog(sk, subflow, skb);
+ }
seq += len;
- if (WARN_ON_ONCE(map_remaining < len))
- break;
+ if (unlikely(map_remaining < len)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ mptcp_dss_corruption(msk, ssk);
+ }
} else {
- WARN_ON_ONCE(!fin);
+ if (unlikely(!fin)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ mptcp_dss_corruption(msk, ssk);
+ }
+
sk_eat_skb(ssk, skb);
- done = true;
}
WRITE_ONCE(tp->copied_seq, seq);
more_data_avail = mptcp_subflow_data_available(ssk);
- if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
- done = true;
- break;
- }
} while (more_data_avail);
- *bytes += moved;
- return done;
+ if (ret)
+ msk->last_data_recv = tcp_jiffies32;
+ return ret;
}
static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
@@ -718,7 +783,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
u64 end_seq;
p = rb_first(&msk->out_of_order_queue);
- pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
while (p) {
skb = rb_to_skb(p);
if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
@@ -740,35 +805,72 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
/* skip overlapping data, if any */
- pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n",
MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
delta);
MPTCP_SKB_CB(skb)->offset += delta;
MPTCP_SKB_CB(skb)->map_seq += delta;
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
- msk->ack_seq = end_seq;
+ msk->bytes_received += end_seq - msk->ack_seq;
+ WRITE_ONCE(msk->ack_seq, end_seq);
moved = true;
}
return moved;
}
+static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
+{
+ int err = sock_error(ssk);
+ int ssk_state;
+
+ if (!err)
+ return false;
+
+ /* only propagate errors on fallen-back sockets or
+ * on MPC connect
+ */
+ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
+ return false;
+
+ /* We only need to propagate the transition to the CLOSE state.
+ * Orphaned socket will see such state change via
+ * subflow_sched_work_if_closed() and that path will properly
+ * destroy the msk as needed.
+ */
+ ssk_state = inet_sk_state_load(ssk);
+ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+ mptcp_set_state(sk, ssk_state);
+ WRITE_ONCE(sk->sk_err, -err);
+
+ /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+ smp_wmb();
+ sk_error_report(sk);
+ return true;
+}
+
+void __mptcp_error_report(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow(msk, subflow)
+ if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
+ break;
+}
+
/* In most cases we will be able to lock the mptcp socket. If its already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
struct sock *sk = (struct sock *)msk;
- unsigned int moved = 0;
+ bool moved;
- __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ moved = __mptcp_move_skbs_from_subflow(msk, ssk, true);
__mptcp_ofo_queue(msk);
- if (unlikely(ssk->sk_err)) {
- if (!sock_owned_by_user(sk))
- __mptcp_error_report(sk);
- else
- __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
- }
+ if (unlikely(ssk->sk_err))
+ __mptcp_subflow_error_report(sk, ssk);
/* If the moves have caught up with the DATA_FIN sequence number
* it's time to ack the DATA_FIN and change socket state, but
@@ -777,41 +879,39 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
*/
if (mptcp_pending_data_fin(sk, NULL))
mptcp_schedule_work(sk);
- return moved > 0;
+ return moved;
}
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
- int sk_rbuf, ssk_rbuf;
/* The peer can send data while we are shutting down this
- * subflow at msk destruction time, but we must avoid enqueuing
+ * subflow at subflow destruction time, but we must avoid enqueuing
* more data to the msk receive queue
*/
- if (unlikely(subflow->disposable))
+ if (unlikely(subflow->closing))
return;
- ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
- sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
- if (unlikely(ssk_rbuf > sk_rbuf))
- sk_rbuf = ssk_rbuf;
-
- /* over limit? can't append more skbs to msk, Also, no need to wake-up*/
- if (__mptcp_rmem(sk) > sk_rbuf) {
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
- return;
- }
-
- /* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk);
- if (move_skbs_to_msk(msk, ssk))
- sk->sk_data_ready(sk);
-
+ if (!sock_owned_by_user(sk)) {
+ /* Wake-up the reader only for in-sequence data */
+ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
+ sk->sk_data_ready(sk);
+ } else {
+ __mptcp_move_skbs_from_subflow(msk, ssk, false);
+ }
mptcp_data_unlock(sk);
}
+static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
+{
+ mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
+ msk->allow_infinite_fallback = false;
+ mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
+}
+
static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
{
struct sock *sk = (struct sock *)msk;
@@ -819,23 +919,27 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
if (sk->sk_state != TCP_ESTABLISHED)
return false;
- /* attach to msk socket only after we are sure we will deal with it
- * at close time
- */
- if (sk->sk_socket && !ssk->sk_socket)
- mptcp_sock_graft(ssk, sk->sk_socket);
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_subflows) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+ mptcp_subflow_joined(msk, ssk);
+ spin_unlock_bh(&msk->fallback_lock);
- mptcp_propagate_sndbuf((struct sock *)msk, ssk);
+ mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
mptcp_sockopt_sync_locked(msk, ssk);
+ mptcp_stop_tout_timer(sk);
+ __mptcp_propagate_sndbuf(sk, ssk);
return true;
}
-static void __mptcp_flush_join_list(struct sock *sk)
+static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list)
{
struct mptcp_subflow_context *tmp, *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
+ list_for_each_entry_safe(subflow, tmp, join_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast(ssk);
@@ -846,14 +950,13 @@ static void __mptcp_flush_join_list(struct sock *sk)
}
}
-static bool mptcp_timer_pending(struct sock *sk)
+static bool mptcp_rtx_timer_pending(struct sock *sk)
{
- return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+ return timer_pending(&sk->mptcp_retransmit_timer);
}
-static void mptcp_reset_timer(struct sock *sk)
+static void mptcp_reset_rtx_timer(struct sock *sk)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
unsigned long tout;
/* prevent rescheduling on close */
@@ -861,78 +964,25 @@ static void mptcp_reset_timer(struct sock *sk)
return;
tout = mptcp_sk(sk)->timer_ival;
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+ sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout);
}
bool mptcp_schedule_work(struct sock *sk)
{
- if (inet_sk_state_load(sk) != TCP_CLOSE &&
- schedule_work(&mptcp_sk(sk)->work)) {
- /* each subflow already holds a reference to the sk, and the
- * workqueue is invoked by a subflow, so sk can't go away here.
- */
- sock_hold(sk);
- return true;
- }
- return false;
-}
-
-void mptcp_subflow_eof(struct sock *sk)
-{
- if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
- mptcp_schedule_work(sk);
-}
-
-static void mptcp_check_for_eof(struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- int receivers = 0;
-
- mptcp_for_each_subflow(msk, subflow)
- receivers += !subflow->rx_eof;
- if (receivers)
- return;
-
- if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
- /* hopefully temporary hack: propagate shutdown status
- * to msk, when all subflows agree on it
- */
- sk->sk_shutdown |= RCV_SHUTDOWN;
-
- smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
- sk->sk_data_ready(sk);
- }
-
- switch (sk->sk_state) {
- case TCP_ESTABLISHED:
- inet_sk_state_store(sk, TCP_CLOSE_WAIT);
- break;
- case TCP_FIN_WAIT1:
- inet_sk_state_store(sk, TCP_CLOSING);
- break;
- case TCP_FIN_WAIT2:
- inet_sk_state_store(sk, TCP_CLOSE);
- break;
- default:
- return;
- }
- mptcp_close_wake_up(sk);
-}
-
-static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return false;
- sock_owned_by_me(sk);
+ /* Get a reference on this socket; mptcp_worker() will release it.
+ * As mptcp_worker() might complete before us, we cannot avoid
+ * a sock_hold()/sock_put() if schedule_work() returns false.
+ */
+ sock_hold(sk);
- mptcp_for_each_subflow(msk, subflow) {
- if (READ_ONCE(subflow->data_avail))
- return mptcp_subflow_tcp_sock(subflow);
- }
+ if (schedule_work(&mptcp_sk(sk)->work))
+ return true;
- return NULL;
+ sock_put(sk);
+ return false;
}
static bool mptcp_skb_can_collapse_to(u64 write_seq,
@@ -979,18 +1029,13 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
+/* called under both the msk socket lock and the data lock */
static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
u64 snd_una;
- /* on fallback we just need to ignore snd_una, as this is really
- * plain TCP
- */
- if (__mptcp_check_fallback(msk))
- msk->snd_una = READ_ONCE(msk->snd_nxt);
-
snd_una = msk->snd_una;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
@@ -1001,7 +1046,7 @@ static void __mptcp_clean_una(struct sock *sk)
if (WARN_ON_ONCE(!msk->recovery))
break;
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ msk->first_pending = mptcp_send_next(sk);
}
dfrag_clear(sk, dfrag);
@@ -1033,13 +1078,15 @@ static void __mptcp_clean_una(struct sock *sk)
msk->recovery = false;
out:
- if (snd_una == READ_ONCE(msk->snd_nxt) &&
- snd_una == READ_ONCE(msk->write_seq)) {
- if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
- mptcp_stop_timer(sk);
+ if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) {
+ if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
+ mptcp_stop_rtx_timer(sk);
} else {
- mptcp_reset_timer(sk);
+ mptcp_reset_rtx_timer(sk);
}
+
+ if (mptcp_pending_data_fin_ack(sk))
+ mptcp_schedule_work(sk);
}
static void __mptcp_clean_una_wakeup(struct sock *sk)
@@ -1063,15 +1110,17 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
bool first = true;
- sk_stream_moderate_sndbuf(sk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (first)
+ if (first && !ssk->sk_bypass_prot_mem) {
tcp_enter_memory_pressure(ssk);
+ first = false;
+ }
+
sk_stream_moderate_sndbuf(ssk);
- first = false;
}
+ __mptcp_sync_sndbuf(sk);
}
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@@ -1209,12 +1258,17 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk,
mpext->infinite_map = 1;
mpext->data_len = 0;
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+ if (!mptcp_try_fallback(ssk, MPTCP_MIB_INFINITEMAPTX)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED);
+ mptcp_subflow_reset(ssk);
+ return;
+ }
+
mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
- pr_fallback(msk);
- mptcp_do_fallback(ssk);
}
+#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
+
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_data_frag *dfrag,
struct mptcp_sendmsg_info *info)
@@ -1230,7 +1284,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
size_t copy;
int i;
- pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n",
msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
if (WARN_ON_ONCE(info->sent > info->limit ||
@@ -1241,6 +1295,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
return -EAGAIN;
/* compute send limit */
+ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE))
+ ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE;
info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
copy = info->size_goal;
@@ -1252,15 +1308,16 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* queue management operation, to avoid breaking the ext <->
* SSN association set here
*/
- mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ mpext = mptcp_get_ext(skb);
if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
TCP_SKB_CB(skb)->eor = 1;
+ tcp_mark_push(tcp_sk(ssk), skb);
goto alloc_skb;
}
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
- if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
+ if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
tcp_mark_push(tcp_sk(ssk), skb);
goto alloc_skb;
}
@@ -1274,7 +1331,7 @@ alloc_skb:
i = skb_shinfo(skb)->nr_frags;
reuse_skb = false;
- mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ mpext = mptcp_get_ext(skb);
}
/* Zero window and all data acked? Probe. */
@@ -1282,7 +1339,12 @@ alloc_skb:
if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
- if (snd_una != msk->snd_nxt) {
+ /* No need for a zero probe if any data is pending
+ * either at the msk or ssk level; skb is the current write
+ * queue tail and can be empty at this point.
+ */
+ if (snd_una != msk->snd_nxt || skb->len ||
+ skb != tcp_send_head(ssk)) {
tcp_remove_empty_skb(ssk);
return 0;
}
@@ -1290,11 +1352,6 @@ alloc_skb:
zero_window_probe = true;
data_seq = snd_una - 1;
copy = 1;
-
- /* all mptcp-level data is acked, no skbs should be present into the
- * ssk write queue
- */
- WARN_ON_ONCE(reuse_skb);
}
copy = min_t(size_t, copy, info->limit - info->sent);
@@ -1323,7 +1380,6 @@ alloc_skb:
if (reuse_skb) {
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
mpext->data_len += copy;
- WARN_ON_ONCE(zero_window_probe);
goto out;
}
@@ -1334,11 +1390,12 @@ alloc_skb:
mpext->use_map = 1;
mpext->dsn64 = 1;
- pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n",
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
mpext->dsn64);
if (zero_window_probe) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
mpext->frozen = 1;
if (READ_ONCE(msk->csum_enabled))
@@ -1397,7 +1454,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow;
@@ -1408,23 +1465,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
u64 linger_time;
long tout = 0;
- sock_owned_by_me(sk);
-
- if (__mptcp_check_fallback(msk)) {
- if (!msk->first)
- return NULL;
- return __tcp_can_send(msk->first) &&
- sk_stream_memory_free(msk->first) ? msk->first : NULL;
- }
-
- /* re-use last subflow, if the burst allow that */
- if (msk->last_snd && msk->snd_burst > 0 &&
- sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
- mptcp_set_timeout(sk);
- return msk->last_snd;
- }
-
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL;
@@ -1432,13 +1472,15 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
mptcp_for_each_subflow(msk, subflow) {
+ bool backup = subflow->backup || subflow->request_bkup;
+
trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_subflow_active(subflow))
continue;
tout = max(tout, mptcp_timeout_from_subflow(subflow));
- nr_active += !subflow->backup;
+ nr_active += !backup;
pace = subflow->avg_pacing_rate;
if (unlikely(!pace)) {
/* init pacing rate from socket */
@@ -1449,9 +1491,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
- if (linger_time < send_info[subflow->backup].linger_time) {
- send_info[subflow->backup].ssk = ssk;
- send_info[subflow->backup].linger_time = linger_time;
+ if (linger_time < send_info[backup].linger_time) {
+ send_info[backup].ssk = ssk;
+ send_info[backup].linger_time = linger_time;
}
}
__mptcp_set_timeout(sk, tout);
@@ -1465,7 +1507,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
* - estimate the faster flow linger time
* - use the above to estimate the amount of byte transferred
* by the faster flow
- * - check that the amount of queued data is greter than the above,
+ * - check that the amount of queued data is greater than the above,
* otherwise do not use the picked, slower, subflow
* We select the subflow with the shorter estimated time to flush
* the queued mem, which basically ensure the above. We just need
@@ -1477,16 +1519,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
- if (!burst) {
- msk->last_snd = NULL;
+ if (!burst)
return ssk;
- }
subflow = mptcp_subflow_ctx(ssk);
subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
READ_ONCE(ssk->sk_pacing_rate) * burst,
burst + wmem);
- msk->last_snd = ssk;
msk->snd_burst = burst;
return ssk;
}
@@ -1518,79 +1557,128 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
* that has been handed to the subflow for transmission
* and skip update in case it was old dfrag.
*/
- if (likely(after64(snd_nxt_new, msk->snd_nxt)))
- msk->snd_nxt = snd_nxt_new;
+ if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
+ msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
+ WRITE_ONCE(msk->snd_nxt, snd_nxt_new);
+ }
}
void mptcp_check_and_set_pending(struct sock *sk)
{
- if (mptcp_send_head(sk))
- mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
+ if (mptcp_send_head(sk)) {
+ mptcp_data_lock(sk);
+ mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING);
+ mptcp_data_unlock(sk);
+ }
}
-void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+ struct mptcp_sendmsg_info *info)
{
- struct sock *prev_ssk = NULL, *ssk = NULL;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_sendmsg_info info = {
- .flags = flags,
- };
- bool do_check_data_fin = false;
struct mptcp_data_frag *dfrag;
- int len;
+ int len, copied = 0, err = 0;
while ((dfrag = mptcp_send_head(sk))) {
- info.sent = dfrag->already_sent;
- info.limit = dfrag->data_len;
+ info->sent = dfrag->already_sent;
+ info->limit = dfrag->data_len;
len = dfrag->data_len - dfrag->already_sent;
while (len > 0) {
int ret = 0;
- prev_ssk = ssk;
- ssk = mptcp_subflow_get_send(msk);
-
- /* First check. If the ssk has changed since
- * the last round, release prev_ssk
- */
- if (ssk != prev_ssk && prev_ssk)
- mptcp_push_release(prev_ssk, &info);
- if (!ssk)
- goto out;
-
- /* Need to lock the new subflow only if different
- * from the previous one, otherwise we are still
- * helding the relevant lock
- */
- if (ssk != prev_ssk)
- lock_sock(ssk);
-
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
if (ret <= 0) {
- if (ret == -EAGAIN)
- continue;
- mptcp_push_release(ssk, &info);
+ err = copied ? : ret;
goto out;
}
- do_check_data_fin = true;
- info.sent += ret;
+ info->sent += ret;
+ copied += ret;
len -= ret;
mptcp_update_post_push(msk, dfrag, ret);
}
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ msk->first_pending = mptcp_send_next(sk);
+
+ if (msk->snd_burst <= 0 ||
+ !sk_stream_memory_free(ssk) ||
+ !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+ err = copied;
+ goto out;
+ }
+ mptcp_set_timeout(sk);
+ }
+ err = copied;
+
+out:
+ if (err > 0)
+ msk->last_data_sent = tcp_jiffies32;
+ return err;
+}
+
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+{
+ struct sock *prev_ssk = NULL, *ssk = NULL;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .flags = flags,
+ };
+ bool do_check_data_fin = false;
+ int push_count = 1;
+
+ while (mptcp_send_head(sk) && (push_count > 0)) {
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ if (mptcp_sched_get_send(msk))
+ break;
+
+ push_count = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+
+ prev_ssk = ssk;
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (ssk != prev_ssk) {
+ /* First check. If the ssk has changed since
+ * the last round, release prev_ssk
+ */
+ if (prev_ssk)
+ mptcp_push_release(prev_ssk, &info);
+
+ /* Need to lock the new subflow only if different
+ * from the previous one, otherwise we are still
+ * holding the relevant lock
+ */
+ lock_sock(ssk);
+ }
+
+ push_count++;
+
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0) {
+ if (ret != -EAGAIN ||
+ (1 << ssk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
+ push_count--;
+ continue;
+ }
+ do_check_data_fin = true;
+ }
+ }
}
/* at this point we held the socket lock for the last subflow we used */
if (ssk)
mptcp_push_release(ssk, &info);
-out:
/* ensure the rtx timer is running */
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
if (do_check_data_fin)
- __mptcp_check_send_data_fin(sk);
+ mptcp_check_send_data_fin(sk);
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
@@ -1599,42 +1687,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
struct mptcp_sendmsg_info info = {
.data_lock_held = true,
};
- struct mptcp_data_frag *dfrag;
+ bool keep_pushing = true;
struct sock *xmit_ssk;
- int len, copied = 0;
+ int copied = 0;
info.flags = 0;
- while ((dfrag = mptcp_send_head(sk))) {
- info.sent = dfrag->already_sent;
- info.limit = dfrag->data_len;
- len = dfrag->data_len - dfrag->already_sent;
- while (len > 0) {
- int ret = 0;
+ while (mptcp_send_head(sk) && keep_pushing) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ int ret = 0;
- /* check for a different subflow usage only after
- * spooling the first chunk of data
- */
- xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
- if (!xmit_ssk)
- goto out;
- if (xmit_ssk != ssk) {
- mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
- MPTCP_DELEGATE_SEND);
- goto out;
- }
-
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ /* check for a different subflow usage only after
+ * spooling the first chunk of data
+ */
+ if (first) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ first = false;
if (ret <= 0)
- goto out;
+ break;
+ copied += ret;
+ continue;
+ }
+
+ if (mptcp_sched_get_send(msk))
+ goto out;
- info.sent += ret;
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0)
+ keep_pushing = false;
copied += ret;
- len -= ret;
- first = false;
+ }
- mptcp_update_post_push(msk, dfrag, ret);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+ if (xmit_ssk != ssk) {
+ mptcp_subflow_delegate(subflow,
+ MPTCP_DELEGATE_SEND);
+ keep_pushing = false;
+ }
+ }
}
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
out:
@@ -1644,8 +1739,8 @@ out:
if (copied) {
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
if (msk->snd_data_fin_enable &&
msk->snd_nxt + 1 == msk->write_seq)
@@ -1653,27 +1748,34 @@ out:
}
}
-static void mptcp_set_nospace(struct sock *sk)
-{
- /* enable autotune */
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
- /* will be cleared on avail space */
- set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
-}
-
static int mptcp_disconnect(struct sock *sk, int flags);
-static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg,
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
size_t len, int *copied_syn)
{
unsigned int saved_flags = msg->msg_flags;
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk;
int ret;
+ /* on flag-based fastopen, MPTCP is supposed to create the
+ * first subflow right now. Otherwise we are in the defer_connect
+ * path, and the first subflow must already be present.
+ * Since the defer_connect flag is cleared after the first successful
+ * fastopen attempt, no need to check for additional subflow status.
+ */
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
+ }
+ if (!msk->first)
+ return -EINVAL;
+
+ ssk = msk->first;
+
lock_sock(ssk);
msg->msg_flags |= MSG_DONTWAIT;
- msk->connect_flags = O_NONBLOCK;
msk->fastopening = 1;
ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
msk->fastopening = 0;
@@ -1691,17 +1793,75 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh
if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
*copied_syn = 0;
} else if (ret && ret != -EINPROGRESS) {
- mptcp_disconnect(sk, 0);
+ /* The disconnect() op called by tcp_sendmsg_fastopen()/
+ * __inet_stream_connect() can fail, due to a locking check,
+ * see mptcp_disconnect().
+ * Attempt it again outside the problematic scope.
+ */
+ if (!mptcp_disconnect(sk, 0)) {
+ sk->sk_disconnects++;
+ sk->sk_socket->state = SS_UNCONNECTED;
+ }
}
+ inet_clear_bit(DEFER_CONNECT, sk);
return ret;
}
+static int do_copy_data_nocache(struct sock *sk, int copy,
+ struct iov_iter *from, char *to)
+{
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
+ if (!copy_from_iter_full_nocache(to, copy, from))
+ return -EFAULT;
+ } else if (!copy_from_iter_full(to, copy, from)) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* open-code sk_stream_memory_free() plus sent limit computation to
+ * avoid indirect calls in fast-path.
+ * Called under the msk socket lock, so we can avoid a bunch of ONCE
+ * annotations.
+ */
+static u32 mptcp_send_limit(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 limit, not_sent;
+
+ if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
+ return 0;
+
+ limit = mptcp_notsent_lowat(sk);
+ if (limit == UINT_MAX)
+ return UINT_MAX;
+
+ not_sent = msk->write_seq - msk->snd_nxt;
+ if (not_sent >= limit)
+ return 0;
+
+ return limit - not_sent;
+}
+
+static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!rfs_is_needed())
+ return;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ sock_rps_record_flow(ssk);
+ }
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
- struct socket *ssock;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1711,12 +1871,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect ||
- msg->msg_flags & MSG_FASTOPEN))) {
+ mptcp_rps_record_subflows(msk);
+
+ if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
+ msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
- ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
+ ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
copied += copied_syn;
if (ret == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -1743,6 +1904,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct mptcp_data_frag *dfrag;
bool dfrag_collapsed;
size_t psize, offset;
+ u32 copy_limit;
+
+ /* ensure fitting the notsent_lowat() constraint */
+ copy_limit = mptcp_send_limit(sk);
+ if (!copy_limit)
+ goto wait_for_memory;
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
@@ -1750,9 +1917,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
dfrag = mptcp_pending_tail(sk);
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
if (!dfrag_collapsed) {
- if (!sk_stream_memory_free(sk))
- goto wait_for_memory;
-
if (!mptcp_page_frag_refill(sk, pfrag))
goto wait_for_memory;
@@ -1767,19 +1931,19 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
offset = dfrag->offset + dfrag->data_len;
psize = pfrag->size - offset;
psize = min_t(size_t, psize, msg_data_left(msg));
+ psize = min_t(size_t, psize, copy_limit);
total_ts = psize + frag_truesize;
if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory;
- if (copy_page_from_iter(dfrag->page, offset, psize,
- &msg->msg_iter) != psize) {
- ret = -EFAULT;
+ ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
+ page_address(dfrag->page) + offset);
+ if (ret)
goto do_error;
- }
/* data successfully copied into the write queue */
- sk->sk_forward_alloc -= total_ts;
+ sk_forward_alloc_add(sk, -total_ts);
copied += psize;
dfrag->data_len += psize;
frag_truesize += psize;
@@ -1794,16 +1958,16 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue);
if (!msk->first_pending)
- WRITE_ONCE(msk->first_pending, dfrag);
+ msk->first_pending = dfrag;
}
- pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
+ pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);
continue;
wait_for_memory:
- mptcp_set_nospace(sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
@@ -1825,21 +1989,38 @@ do_error:
goto out;
}
-static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
- struct msghdr *msg,
- size_t len, int flags,
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
+
+static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags, int copied_total,
struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
+ struct mptcp_sock *msk = mptcp_sk(sk);
struct sk_buff *skb, *tmp;
+ int total_data_len = 0;
int copied = 0;
- skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
- u32 offset = MPTCP_SKB_CB(skb)->offset;
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
+ u32 delta, offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
- u32 count = min_t(size_t, len - copied, data_len);
+ u32 count;
int err;
+ if (flags & MSG_PEEK) {
+ /* skip already peeked skbs */
+ if (total_data_len + data_len <= copied_total) {
+ total_data_len += data_len;
+ continue;
+ }
+
+ /* skip the already peeked data in the current skb */
+ delta = copied_total - total_data_len;
+ offset += delta;
+ data_len -= delta;
+ }
+
+ count = min_t(size_t, len - copied, data_len);
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_msg(skb, offset, msg, count);
if (unlikely(err < 0)) {
@@ -1856,26 +2037,28 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
copied += count;
- if (count < data_len) {
- if (!(flags & MSG_PEEK)) {
+ if (!(flags & MSG_PEEK)) {
+ msk->bytes_consumed += count;
+ if (count < data_len) {
MPTCP_SKB_CB(skb)->offset += count;
MPTCP_SKB_CB(skb)->map_seq += count;
+ break;
}
- break;
- }
- if (!(flags & MSG_PEEK)) {
- /* we will bulk release the skb memory later */
+ /* avoid the indirect call, we know the destructor is sock_rfree */
skb->destructor = NULL;
- WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
- __skb_unlink(skb, &msk->receive_queue);
- __kfree_skb(skb);
+ skb->sk = NULL;
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, skb->truesize);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ skb_attempt_defer_free(skb);
}
if (copied >= len)
break;
}
+ mptcp_rcv_space_adjust(msk, copied);
return copied;
}
@@ -1887,14 +2070,18 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
+ u8 scaling_ratio = U8_MAX;
u32 time, advmss = 1;
u64 rtt_us, mstamp;
- sock_owned_by_me(sk);
+ msk_owned_by_me(msk);
if (copied <= 0)
return;
+ if (!msk->rcvspace_init)
+ mptcp_rcv_space_init(msk, msk->first);
+
msk->rcvq_space.copied += copied;
mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
@@ -1917,126 +2104,115 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
rtt_us = max(sf_rtt_us, rtt_us);
advmss = max(sf_advmss, advmss);
+ scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
}
msk->rcvq_space.rtt_us = rtt_us;
+ msk->scaling_ratio = scaling_ratio;
if (time < (rtt_us >> 3) || rtt_us == 0)
return;
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvmem, rcvbuf;
- u64 rcvwin, grow;
-
- rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
- grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
- do_div(grow, msk->rcvq_space.space);
- rcvwin += (grow << 1);
-
- rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(sk, rcvmem) < advmss)
- rcvmem += 128;
-
- do_div(rcvwin, advmss);
- rcvbuf = min_t(u64, rcvwin * rcvmem,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
- if (rcvbuf > sk->sk_rcvbuf) {
- u32 window_clamp;
-
- window_clamp = tcp_win_from_space(sk, rcvbuf);
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
- /* Make subflows follow along. If we do not do this, we
- * get drops at subflow level if skbs can't be moved to
- * the mptcp rx queue fast enough (announced rcv_win can
- * exceed ssk->sk_rcvbuf).
- */
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk;
- bool slow;
-
- ssk = mptcp_subflow_tcp_sock(subflow);
- slow = lock_sock_fast(ssk);
- WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
- tcp_sk(ssk)->window_clamp = window_clamp;
- tcp_cleanup_rbuf(ssk, 1);
- unlock_sock_fast(ssk, slow);
- }
+ if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+ bool slow;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
+ /* subflows can be added before tcp_init_transfer() */
+ if (tcp_sk(ssk)->rcvq_space.space)
+ tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied);
+ unlock_sock_fast(ssk, slow);
}
}
- msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
msk->rcvq_space.copied = 0;
msk->rcvq_space.time = mstamp;
}
-static void __mptcp_update_rmem(struct sock *sk)
+static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta)
{
+ struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list);
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool moved = false;
- if (!msk->rmem_released)
- return;
+ *delta = 0;
+ while (1) {
+ /* If the msk recvbuf is full, stop; don't drop */
+ if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
+ break;
+
+ prefetch(skb->next);
+ list_del(&skb->list);
+ *delta += skb->truesize;
- atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
- mptcp_rmem_uncharge(sk, msk->rmem_released);
- WRITE_ONCE(msk->rmem_released, 0);
+ moved |= __mptcp_move_skb(sk, skb);
+ if (list_empty(skbs))
+ break;
+
+ skb = list_first_entry(skbs, struct sk_buff, list);
+ }
+
+ __mptcp_ofo_queue(msk);
+ if (moved)
+ mptcp_check_data_fin((struct sock *)msk);
+ return moved;
}
-static void __mptcp_splice_receive_queue(struct sock *sk)
+static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+ /* After CG initialization, subflows should never add skbs before
+ * gaining the CG themselves.
+ */
+ DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket &&
+ mem_cgroup_from_sk(sk));
+
+ /* Don't spool the backlog if the rcvbuf is full. */
+ if (list_empty(&msk->backlog_list) ||
+ sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
+ return false;
+
+ INIT_LIST_HEAD(skbs);
+ list_splice_init(&msk->backlog_list, skbs);
+ return true;
}
-static bool __mptcp_move_skbs(struct mptcp_sock *msk)
+static void mptcp_backlog_spooled(struct sock *sk, u32 moved,
+ struct list_head *skbs)
{
- struct sock *sk = (struct sock *)msk;
- unsigned int moved = 0;
- bool ret, done;
+ struct mptcp_sock *msk = mptcp_sk(sk);
- do {
- struct sock *ssk = mptcp_subflow_recv_lookup(msk);
- bool slowpath;
+ WRITE_ONCE(msk->backlog_len, msk->backlog_len - moved);
+ list_splice(skbs, &msk->backlog_list);
+}
- /* we can have data pending in the subflows only if the msk
- * receive buffer was full at subflow_data_ready() time,
- * that is an unlikely slow path.
- */
- if (likely(!ssk))
- break;
+static bool mptcp_move_skbs(struct sock *sk)
+{
+ struct list_head skbs;
+ bool enqueued = false;
+ u32 moved;
- slowpath = lock_sock_fast(ssk);
- mptcp_data_lock(sk);
- __mptcp_update_rmem(sk);
- done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_data_lock(sk);
+ while (mptcp_can_spool_backlog(sk, &skbs)) {
mptcp_data_unlock(sk);
+ enqueued |= __mptcp_move_skbs(sk, &skbs, &moved);
- if (unlikely(ssk->sk_err))
- __mptcp_error_report(sk);
- unlock_sock_fast(ssk, slowpath);
- } while (!done);
-
- /* acquire the data lock only if some input data is pending */
- ret = moved > 0;
- if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
- !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
mptcp_data_lock(sk);
- __mptcp_update_rmem(sk);
- ret |= __mptcp_ofo_queue(msk);
- __mptcp_splice_receive_queue(sk);
- mptcp_data_unlock(sk);
+ mptcp_backlog_spooled(sk, moved, &skbs);
}
- if (ret)
- mptcp_check_data_fin((struct sock *)msk);
- return !skb_queue_empty(&msk->receive_queue);
+ mptcp_data_unlock(sk);
+ return enqueued;
}
static unsigned int mptcp_inq_hint(const struct sock *sk)
@@ -2044,9 +2220,9 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
const struct mptcp_sock *msk = mptcp_sk(sk);
const struct sk_buff *skb;
- skb = skb_peek(&msk->receive_queue);
+ skb = skb_peek(&sk->sk_receive_queue);
if (skb) {
- u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+ u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
if (hint_val >= INT_MAX)
return INT_MAX;
@@ -2079,6 +2255,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out_err;
}
+ mptcp_rps_record_subflows(msk);
+
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
len = min_t(size_t, len, INT_MAX);
@@ -2088,9 +2266,10 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
cmsg_flags = MPTCP_CMSG_INQ;
while (copied < len) {
- int bytes_read;
+ int err, bytes_read;
- bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
+ bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags,
+ copied, &tss, &cmsg_flags);
if (unlikely(bytes_read < 0)) {
if (!copied)
copied = bytes_read;
@@ -2099,13 +2278,10 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- /* be sure to advertise window change */
- mptcp_cleanup_rbuf(msk);
-
- if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
+ if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk))
continue;
- /* only the master socket status is relevant here. The exit
+ /* only the MPTCP socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
*/
if (copied >= target)
@@ -2124,17 +2300,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
break;
}
- if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
- mptcp_check_for_eof(msk);
-
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- /* race breaker: the shutdown could be after the
- * previous receive queue check
- */
- if (__mptcp_move_skbs(msk))
- continue;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
- }
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN;
@@ -2152,10 +2319,17 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
}
- pr_debug("block timeout %ld", timeo);
- sk_wait_data(sk, &timeo, NULL);
+ pr_debug("block timeout %ld\n", timeo);
+ mptcp_cleanup_rbuf(msk, copied);
+ err = sk_wait_data(sk, &timeo, NULL);
+ if (err < 0) {
+ err = copied ? : err;
+ goto out_err;
+ }
}
+ mptcp_cleanup_rbuf(msk, copied);
+
out_err:
if (cmsg_flags && copied >= 0) {
if (cmsg_flags & MPTCP_CMSG_TS)
@@ -2168,11 +2342,8 @@ out_err:
}
}
- pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
- msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
- skb_queue_empty(&msk->receive_queue), copied);
- if (!(flags & MSG_PEEK))
- mptcp_rcv_space_adjust(msk, copied);
+ pr_debug("msk=%p rx queue empty=%d copied=%d\n",
+ msk, skb_queue_empty(&sk->sk_receive_queue), copied);
release_sock(sk);
return copied;
@@ -2180,9 +2351,7 @@ out_err:
static void mptcp_retransmit_timer(struct timer_list *t)
{
- struct inet_connection_sock *icsk = from_timer(icsk, t,
- icsk_retransmit_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
+ struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer);
struct mptcp_sock *msk = mptcp_sk(sk);
bh_lock_sock(sk);
@@ -2198,9 +2367,11 @@ static void mptcp_retransmit_timer(struct timer_list *t)
sock_put(sk);
}
-static void mptcp_timeout_timer(struct timer_list *t)
+static void mptcp_tout_timer(struct timer_list *t)
{
- struct sock *sk = from_timer(sk, t, sk_timer);
+ struct inet_connection_sock *icsk =
+ timer_container_of(icsk, t, mptcp_tout_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
mptcp_schedule_work(sk);
sock_put(sk);
@@ -2211,17 +2382,12 @@ static void mptcp_timeout_timer(struct timer_list *t)
*
* A backup subflow is returned only if that is the only kind available.
*/
-static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
struct sock *backup = NULL, *pick = NULL;
struct mptcp_subflow_context *subflow;
int min_stale_count = INT_MAX;
- sock_owned_by_me((const struct sock *)msk);
-
- if (__mptcp_check_fallback(msk))
- return NULL;
-
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -2235,7 +2401,7 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
continue;
}
- if (subflow->backup) {
+ if (subflow->backup || subflow->request_bkup) {
if (!backup)
backup = ssk;
continue;
@@ -2252,14 +2418,6 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
return min_stale_count > 1 ? backup : NULL;
}
-static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
-{
- if (msk->subflow) {
- iput(SOCK_INODE(msk->subflow));
- msk->subflow = NULL;
- }
-}
-
bool __mptcp_retransmit_pending_data(struct sock *sk)
{
struct mptcp_data_frag *cur, *rtx_head;
@@ -2268,9 +2426,6 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
if (__mptcp_check_fallback(msk))
return false;
- if (tcp_rtx_and_write_queues_empty(sk))
- return false;
-
/* the closing socket has some data untransmitted and/or unacked:
* some data in the mptcp rtx queue has not really xmitted yet.
* keep it simple and re-inject the whole mptcp level rtx queue
@@ -2302,7 +2457,26 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
/* flags for __mptcp_close_ssk() */
#define MPTCP_CF_PUSH BIT(1)
-#define MPTCP_CF_FASTCLOSE BIT(2)
+
+/* be sure to send a reset only if the caller asked for it; also
+ * completely clean the subflow status when the subflow reaches
+ * TCP_CLOSE state
+ */
+static void __mptcp_subflow_disconnect(struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ unsigned int flags)
+{
+ if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ subflow->send_fastclose) {
+		/* The MPTCP code never waits on the subflow sockets; TCP-level
+ * disconnect should never fail
+ */
+ WARN_ON_ONCE(tcp_disconnect(ssk, 0));
+ mptcp_subflow_ctx_reset(subflow);
+ } else {
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ }
+}
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
@@ -2317,34 +2491,56 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
unsigned int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- bool need_push, dispose_it;
-
- dispose_it = !msk->subflow || ssk != msk->subflow->sk;
- if (dispose_it)
- list_del(&subflow->node);
+ bool dispose_it, need_push = false;
+ int fwd_remaining;
+ /* Do not pass RX data to the msk, even if the subflow socket is not
+ * going to be freed (i.e. even for the first subflow on graceful
+	 * subflow close).
+ */
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ subflow->closing = 1;
- if (flags & MPTCP_CF_FASTCLOSE) {
- /* be sure to force the tcp_disconnect() path,
- * to generate the egress reset
- */
- ssk->sk_lingertime = 0;
- sock_set_flag(ssk, SOCK_LINGER);
- subflow->send_fastclose = 1;
+ /* Borrow the fwd allocated page left-over; fwd memory for the subflow
+	 * could be negative at this point, but will reach zero soon - when
+	 * the data allocated from such fragment is freed.
+ */
+ if (subflow->lent_mem_frag) {
+ fwd_remaining = PAGE_SIZE - subflow->lent_mem_frag;
+ sk_forward_alloc_add(sk, fwd_remaining);
+ sk_forward_alloc_add(ssk, -fwd_remaining);
+ subflow->lent_mem_frag = 0;
}
+ /* If the first subflow moved to a close state before accept, e.g. due
+ * to an incoming reset or listener shutdown, the subflow socket is
+ * already deleted by inet_child_forget() and the mptcp socket can't
+ * survive too.
+	 * survive either.
+ if (msk->in_accept_queue && msk->first == ssk &&
+ (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
+		/* ensure the later check in mptcp_worker() will dispose of the msk */
+ sock_set_flag(sk, SOCK_DEAD);
+ mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
+ mptcp_subflow_drop_ctx(ssk);
+ goto out_release;
+ }
+
+ dispose_it = msk->free_first || ssk != msk->first;
+ if (dispose_it)
+ list_del(&subflow->node);
+
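+	/* The fastclose reset has already been sent by mptcp_do_fastclose();
+	 * move the subflow straight to TCP_CLOSE
+	 */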
+ if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE)
+ tcp_set_state(ssk, TCP_CLOSE);
+
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
- tcp_disconnect(ssk, 0);
- msk->subflow->state = SS_UNCONNECTED;
- mptcp_subflow_ctx_reset(subflow);
+ __mptcp_subflow_disconnect(ssk, subflow, flags);
release_sock(ssk);
goto out;
}
- sock_orphan(ssk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2352,45 +2548,77 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
* reference owned by msk;
*/
if (!inet_csk(ssk)->icsk_ulp_ops) {
+ WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD));
kfree_rcu(subflow, rcu);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
- if (ssk->sk_state == TCP_LISTEN) {
- tcp_set_state(ssk, TCP_CLOSE);
- mptcp_subflow_queue_clean(sk, ssk);
- inet_csk_listen_stop(ssk);
- mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
- }
__tcp_close(ssk, 0);
/* close acquired an extra ref */
__sock_put(ssk);
}
+
+out_release:
+ __mptcp_subflow_error_report(sk, ssk);
release_sock(ssk);
sock_put(ssk);
if (ssk == msk->first)
- msk->first = NULL;
+ WRITE_ONCE(msk->first, NULL);
out:
- if (ssk == msk->last_snd)
- msk->last_snd = NULL;
-
+ __mptcp_sync_sndbuf(sk);
if (need_push)
__mptcp_push_pending(sk, 0);
+
+ /* Catch every 'all subflows closed' scenario, including peers silently
+ * closing them, e.g. due to timeout.
+ * For established sockets, allow an additional timeout before closing,
+ * as the protocol can still create more subflows.
+ */
+ if (list_is_singular(&msk->conn_list) && msk->first &&
+ inet_sk_state_load(msk->first) == TCP_CLOSE) {
+ if (sk->sk_state != TCP_ESTABLISHED ||
+ msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
+ mptcp_set_state(sk, TCP_CLOSE);
+ mptcp_close_wake_up(sk);
+ } else {
+ mptcp_start_tout_timer(sk);
+ }
+ }
}
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+
+ /* The first subflow can already be closed and still in the list */
+ if (subflow->close_event_done)
+ return;
+
+ subflow->close_event_done = true;
+
if (sk->sk_state == TCP_ESTABLISHED)
mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
+ /* Remove any reference from the backlog to this ssk; backlog skbs consume
+ * space in the msk receive queue, no need to touch sk->sk_rmem_alloc
+ */
+ list_for_each_entry(skb, &msk->backlog_list, list) {
+ if (skb->sk != ssk)
+ continue;
+
+ atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
+ skb->sk = NULL;
+ }
+
/* subflow aborted before reaching the fully_established status
* attempt the creation of the next subflow
*/
- mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow);
__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
}
@@ -2400,43 +2628,40 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
-static void __mptcp_close_subflow(struct mptcp_sock *msk)
+static void __mptcp_close_subflow(struct sock *sk)
{
struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
might_sleep();
mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int ssk_state = inet_sk_state_load(ssk);
- if (inet_sk_state_load(ssk) != TCP_CLOSE)
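+		/* also reap CLOSE_WAIT subflows, but only while the MPTCP
+		 * connection is still established and has not fallen back
+		 */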
+ if (ssk_state != TCP_CLOSE &&
+ (ssk_state != TCP_CLOSE_WAIT ||
+ inet_sk_state_load(sk) != TCP_ESTABLISHED ||
+ __mptcp_check_fallback(msk)))
continue;
/* 'subflow_data_ready' will re-sched once rx queue is empty */
if (!skb_queue_empty_lockless(&ssk->sk_receive_queue))
continue;
- mptcp_close_ssk((struct sock *)msk, ssk, subflow);
+ mptcp_close_ssk(sk, ssk, subflow);
}
+
}
-static bool mptcp_check_close_timeout(const struct sock *sk)
+static bool mptcp_close_tout_expired(const struct sock *sk)
{
- s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
- struct mptcp_subflow_context *subflow;
-
- if (delta >= TCP_TIMEWAIT_LEN)
- return true;
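+	/* icsk_mtup.probe_timestamp is reused to store the close timeout
+	 * start time; zero means the timeout has not been armed yet
+	 */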
+ if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
+ sk->sk_state == TCP_CLOSE)
+ return false;
- /* if all subflows are in closed status don't bother with additional
- * timeout
- */
- mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
- if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
- TCP_CLOSE)
- return false;
- }
- return true;
+ return time_after32(tcp_jiffies32,
+ inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
}
static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2455,7 +2680,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
- tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+ mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
@@ -2464,19 +2689,19 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
/* Mirror the tcp_reset() error propagation */
switch (sk->sk_state) {
case TCP_SYN_SENT:
- sk->sk_err = ECONNREFUSED;
+ WRITE_ONCE(sk->sk_err, ECONNREFUSED);
break;
case TCP_CLOSE_WAIT:
- sk->sk_err = EPIPE;
+ WRITE_ONCE(sk->sk_err, EPIPE);
break;
case TCP_CLOSE:
return;
default:
- sk->sk_err = ECONNRESET;
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
}
- inet_sk_state_store(sk, TCP_CLOSE);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ mptcp_set_state(sk, TCP_CLOSE);
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
@@ -2490,23 +2715,25 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
static void __mptcp_retrans(struct sock *sk)
{
+ struct mptcp_sendmsg_info info = { .data_lock_held = true, };
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_sendmsg_info info = {};
+ struct mptcp_subflow_context *subflow;
struct mptcp_data_frag *dfrag;
- size_t copied = 0;
struct sock *ssk;
- int ret;
+ int ret, err;
+ u16 len = 0;
mptcp_clean_una_wakeup(sk);
/* first check ssk: need to kick "stale" logic */
- ssk = mptcp_subflow_get_retrans(msk);
+ err = mptcp_sched_get_retrans(msk);
dfrag = mptcp_rtx_head(sk);
if (!dfrag) {
if (mptcp_data_fin_enabled(msk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits,
+ icsk->icsk_retransmits + 1);
mptcp_set_datafin_timeout(sk);
mptcp_send_ack(msk);
@@ -2514,63 +2741,100 @@ static void __mptcp_retrans(struct sock *sk)
}
if (!mptcp_send_head(sk))
- return;
+ goto clear_scheduled;
goto reset_timer;
}
- if (!ssk)
+ if (err)
goto reset_timer;
- lock_sock(ssk);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ u16 copied = 0;
- /* limit retransmission to the bytes already sent on some subflows */
- info.sent = 0;
- info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
- while (info.sent < info.limit) {
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
- if (ret <= 0)
- break;
+ mptcp_subflow_set_scheduled(subflow, false);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
- copied += ret;
- info.sent += ret;
- }
- if (copied) {
- dfrag->already_sent = max(dfrag->already_sent, info.sent);
- tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
- info.size_goal);
- WRITE_ONCE(msk->allow_infinite_fallback, false);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+
+ /* limit retransmission to the bytes already sent on some subflows */
+ info.sent = 0;
+ info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+ dfrag->already_sent;
+
+			/* make the whole retrans decision, xmit and fallback
+			 * disabling atomic
+			 */
+ spin_lock_bh(&msk->fallback_lock);
+ if (__mptcp_check_fallback(msk)) {
+ spin_unlock_bh(&msk->fallback_lock);
+ release_sock(ssk);
+ goto clear_scheduled;
+ }
+
+ while (info.sent < info.limit) {
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
+ break;
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+ copied += ret;
+ info.sent += ret;
+ }
+ if (copied) {
+ len = max(copied, len);
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+ msk->allow_infinite_fallback = false;
+ }
+ spin_unlock_bh(&msk->fallback_lock);
+
+ release_sock(ssk);
+ }
}
- release_sock(ssk);
+ msk->bytes_retrans += len;
+ dfrag->already_sent = max(dfrag->already_sent, len);
reset_timer:
mptcp_check_and_set_pending(sk);
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
+
+clear_scheduled:
+ /* If no rtx data was available or in case of fallback, there
+	 * could be left-over scheduled subflows; clear them all,
+	 * or a later xmit could use stale ones
+ */
+ mptcp_for_each_subflow(msk, subflow)
+ if (READ_ONCE(subflow->scheduled))
+ mptcp_subflow_set_scheduled(subflow, false);
}
/* schedule the timeout timer for the relevant event: either close timeout
* or mp_fail timeout. The close timeout takes precedence on the mp_fail one
*/
-void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
+void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
{
struct sock *sk = (struct sock *)msk;
unsigned long timeout, close_timeout;
- if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
+ if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
return;
- close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
+ close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp -
+ tcp_jiffies32 + jiffies + mptcp_close_timeout(sk);
/* the close timeout takes precedence on the fail one, and here at least one of
* them is active
*/
- timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
+ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
- sk_reset_timer(sk, &sk->sk_timer, timeout);
+ sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout);
}
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
@@ -2581,14 +2845,30 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
if (!ssk)
return;
- pr_debug("MP_FAIL doesn't respond, reset the subflow");
+ pr_debug("MP_FAIL doesn't respond, reset the subflow\n");
slow = lock_sock_fast(ssk);
mptcp_subflow_reset(ssk);
WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
unlock_sock_fast(ssk, slow);
+}
+
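+/* Drop every skb still sitting in the msk backlog and reclaim the
+ * associated forward memory.
+ */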
+static void mptcp_backlog_purge(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *tmp, *skb;
+ LIST_HEAD(backlog);
+
+ mptcp_data_lock(sk);
+ list_splice_init(&msk->backlog_list, &backlog);
+ msk->backlog_len = 0;
+ mptcp_data_unlock(sk);
- mptcp_reset_timeout(msk, 0);
+ list_for_each_entry_safe(skb, tmp, &backlog, list) {
+ mptcp_borrow_fwdmem(sk, skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
+ }
+ sk_mem_reclaim(sk);
}
static void mptcp_do_fastclose(struct sock *sk)
@@ -2596,9 +2876,34 @@ static void mptcp_do_fastclose(struct sock *sk)
struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_for_each_subflow_safe(msk, subflow, tmp)
- __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
- subflow, MPTCP_CF_FASTCLOSE);
+ mptcp_set_state(sk, TCP_CLOSE);
+ mptcp_backlog_purge(sk);
+
+	/* Explicitly send the fastclose reset as needed */
+ if (__mptcp_check_fallback(msk))
+ return;
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+
+		/* Some subflow socket states don't allow/need a reset. */
+ if ((1 << ssk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ goto unlock;
+
+ subflow->send_fastclose = 1;
+
+ /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+ * issue in __tcp_select_window(), see tcp_disconnect().
+ */
+ inet_csk(ssk)->icsk_ack.rcv_mss = TCP_MIN_MSS;
+
+ tcp_send_active_reset(ssk, ssk->sk_allocation,
+ SK_RST_REASON_TCP_ABORT_ON_CLOSE);
+unlock:
+ release_sock(ssk);
+ }
}
static void mptcp_worker(struct work_struct *work)
@@ -2610,38 +2915,33 @@ static void mptcp_worker(struct work_struct *work)
lock_sock(sk);
state = sk->sk_state;
- if (unlikely(state == TCP_CLOSE))
+ if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto unlock;
- mptcp_check_data_fin_ack(sk);
-
mptcp_check_fastclose(msk);
- mptcp_pm_nl_work(msk);
-
- if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
- mptcp_check_for_eof(msk);
+ mptcp_pm_worker(msk);
- __mptcp_check_send_data_fin(sk);
+ mptcp_check_send_data_fin(sk);
+ mptcp_check_data_fin_ack(sk);
mptcp_check_data_fin(sk);
- /* There is no point in keeping around an orphaned sk timedout or
- * closed, but we need the msk around to reply to incoming DATA_FIN,
- * even if it is orphaned and in FIN_WAIT2 state
- */
- if (sock_flag(sk, SOCK_DEAD)) {
- if (mptcp_check_close_timeout(sk)) {
- inet_sk_state_store(sk, TCP_CLOSE);
- mptcp_do_fastclose(sk);
- }
- if (sk->sk_state == TCP_CLOSE) {
- __mptcp_destroy_sock(sk);
- goto unlock;
- }
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ __mptcp_close_subflow(sk);
+
+ if (mptcp_close_tout_expired(sk)) {
+ struct mptcp_subflow_context *subflow, *tmp;
+
+ mptcp_do_fastclose(sk);
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, subflow->tcp_sock, subflow, 0);
+ mptcp_close_wake_up(sk);
}
- if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
- __mptcp_close_subflow(msk);
+ if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ goto unlock;
+ }
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
@@ -2655,34 +2955,38 @@ unlock:
sock_put(sk);
}
-static int __mptcp_init_sock(struct sock *sk)
+static void __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
+ INIT_LIST_HEAD(&msk->backlog_list);
INIT_WORK(&msk->work, mptcp_worker);
- __skb_queue_head_init(&msk->receive_queue);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
- msk->rmem_fwd_alloc = 0;
- WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
+ msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
+ msk->backlog_len = 0;
- msk->first = NULL;
+ WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
- WRITE_ONCE(msk->allow_infinite_fallback, true);
+ msk->allow_infinite_fallback = true;
+ msk->allow_subflows = true;
msk->recovery = false;
+ msk->subflow_id = 1;
+ msk->last_data_sent = tcp_jiffies32;
+ msk->last_data_recv = tcp_jiffies32;
+ msk->last_ack_recv = tcp_jiffies32;
mptcp_pm_data_init(msk);
+ spin_lock_init(&msk->fallback_lock);
/* re-use the csk retrans timer for MPTCP-level retrans */
- timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
- timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
-
- return 0;
+ timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0);
+ timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0);
}
static void mptcp_ca_reset(struct sock *sk)
@@ -2690,7 +2994,8 @@ static void mptcp_ca_reset(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_assign_congestion_control(sk);
- strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
+ strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name,
+ sizeof(mptcp_sk(sk)->ca_name));
/* no need to keep a reference to the ops, the name will suffice */
tcp_cleanup_congestion_control(sk);
@@ -2702,9 +3007,7 @@ static int mptcp_init_sock(struct sock *sk)
struct net *net = sock_net(sk);
int ret;
- ret = __mptcp_init_sock(sk);
- if (ret)
- return ret;
+ __mptcp_init_sock(sk);
if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
@@ -2712,7 +3015,10 @@ static int mptcp_init_sock(struct sock *sk)
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
- ret = __mptcp_socket_create(mptcp_sk(sk));
+ rcu_read_lock();
+ ret = mptcp_init_sched(mptcp_sk(sk),
+ mptcp_sched_find(mptcp_get_scheduler(net)));
+ rcu_read_unlock();
if (ret)
return ret;
@@ -2724,8 +3030,8 @@ static int mptcp_init_sock(struct sock *sk)
mptcp_ca_reset(sk);
sk_sockets_allocated_inc(sk);
- sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
- sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
+ sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
+ sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);
return 0;
}
@@ -2735,7 +3041,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- WRITE_ONCE(msk->first_pending, NULL);
+ msk->first_pending = NULL;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(sk, dfrag);
}
@@ -2758,18 +3064,24 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
break;
fallthrough;
case TCP_SYN_SENT:
- tcp_disconnect(ssk, O_NONBLOCK);
+ WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK));
break;
default:
if (__mptcp_check_fallback(mptcp_sk(sk))) {
- pr_debug("Fallback");
+ pr_debug("Fallback\n");
ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how);
+
+ /* simulate the data_fin ack reception to let the state
+ * machine move forward
+ */
+ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
+ mptcp_schedule_work(sk);
} else {
- pr_debug("Sending DATA_FIN on subflow %p", ssk);
+ pr_debug("Sending DATA_FIN on subflow %p\n", ssk);
tcp_send_ack(ssk);
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
}
break;
}
@@ -2777,6 +3089,29 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
release_sock(ssk);
}
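+/* Like inet_sk_state_store(), but also keeps the MPTCP_MIB_CURRESTAB
+ * counter in sync with the ESTABLISHED/CLOSE_WAIT transitions.
+ */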
+void mptcp_set_state(struct sock *sk, int state)
+{
+ int oldstate = sk->sk_state;
+
+ switch (state) {
+ case TCP_ESTABLISHED:
+ if (oldstate != TCP_ESTABLISHED)
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
+ break;
+ case TCP_CLOSE_WAIT:
+ /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state:
+ * MPTCP "accepted" sockets will be created later on. So no
+ * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT.
+ */
+ break;
+ default:
+ if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
+ MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
+ }
+
+ inet_sk_state_store(sk, state);
+}
+
static const unsigned char new_state[16] = {
/* current state: new state: action: */
[0 /* (Invalid) */] = TCP_CLOSE,
@@ -2799,17 +3134,17 @@ static int mptcp_close_state(struct sock *sk)
int next = (int)new_state[sk->sk_state];
int ns = next & TCP_STATE_MASK;
- inet_sk_state_store(sk, ns);
+ mptcp_set_state(sk, ns);
return next & TCP_ACTION_FIN;
}
-static void __mptcp_check_send_data_fin(struct sock *sk)
+static void mptcp_check_send_data_fin(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
+ pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n",
msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
msk->snd_nxt, msk->write_seq);
@@ -2822,19 +3157,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
- /* fallback socket will not get data_fin/ack, can move to the next
- * state now
- */
- if (__mptcp_check_fallback(msk)) {
- WRITE_ONCE(msk->snd_una, msk->write_seq);
- if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
- inet_sk_state_store(sk, TCP_CLOSE);
- mptcp_close_wake_up(sk);
- } else if (sk->sk_state == TCP_FIN_WAIT1) {
- inet_sk_state_store(sk, TCP_FIN_WAIT2);
- }
- }
-
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -2846,7 +3168,7 @@ static void __mptcp_wr_shutdown(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
+ pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n",
msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
!!mptcp_send_head(sk));
@@ -2854,42 +3176,60 @@ static void __mptcp_wr_shutdown(struct sock *sk)
WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
WRITE_ONCE(msk->snd_data_fin_enable, 1);
- __mptcp_check_send_data_fin(sk);
+ mptcp_check_send_data_fin(sk);
}
static void __mptcp_destroy_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
might_sleep();
- mptcp_stop_timer(sk);
- sk_stop_timer(sk, &sk->sk_timer);
+ mptcp_stop_rtx_timer(sk);
+ sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
msk->pm.status = 0;
+ mptcp_release_sched(msk);
sk->sk_prot->destroy(sk);
- WARN_ON_ONCE(msk->rmem_fwd_alloc);
- WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
- sk_refcnt_debug_release(sk);
sock_put(sk);
}
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+void __mptcp_unaccepted_force_close(struct sock *sk)
{
- /* Concurrent splices from sk_receive_queue into receive_queue will
- * always show at least one non-empty queue when checked in this order.
- */
- if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
- skb_queue_empty_lockless(&msk->receive_queue))
- return 0;
+ sock_set_flag(sk, SOCK_DEAD);
+ mptcp_do_fastclose(sk);
+ __mptcp_destroy_sock(sk);
+}
- return EPOLLIN | EPOLLRDNORM;
+static __poll_t mptcp_check_readable(struct sock *sk)
+{
+ return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
+}
+
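+/* If the msk is a listener, shut down the first subflow listener and
+ * emit the MPTCP_EVENT_LISTENER_CLOSED event.
+ */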
+static void mptcp_check_listen_stop(struct sock *sk)
+{
+ struct sock *ssk;
+
+ if (inet_sk_state_load(sk) != TCP_LISTEN)
+ return;
+
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ ssk = mptcp_sk(sk)->first;
+ if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
+ return;
+
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ tcp_set_state(ssk, TCP_CLOSE);
+ mptcp_subflow_queue_clean(sk, ssk);
+ inet_csk_listen_stop(ssk);
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+ release_sock(ssk);
}
bool __mptcp_close(struct sock *sk, long timeout)
@@ -2897,18 +3237,22 @@ bool __mptcp_close(struct sock *sk, long timeout)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
+ int subflows_alive = 0;
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
- inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_check_listen_stop(sk);
+ mptcp_set_state(sk, TCP_CLOSE);
goto cleanup;
}
- if (mptcp_check_readable(msk)) {
- /* the msk has read data, do the MPTCP equivalent of TCP reset */
- inet_sk_state_store(sk, TCP_CLOSE);
+ if (mptcp_data_avail(msk) || timeout < 0) {
+		/* If the msk has read data, or the caller explicitly asks for it,
+ * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
+ */
mptcp_do_fastclose(sk);
+ timeout = 0;
} else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
}
@@ -2917,11 +3261,12 @@ bool __mptcp_close(struct sock *sk, long timeout)
cleanup:
/* orphan all the subflows */
- inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ subflows_alive += ssk->sk_state != TCP_CLOSE;
+
/* since the close timeout takes precedence on the fail one,
* cancel the latter
*/
@@ -2937,16 +3282,21 @@ cleanup:
}
sock_orphan(sk);
+ /* all the subflows are closed, only timeout can change the msk
+	 * state, let's not keep resources busy for no reason
+ */
+ if (subflows_alive == 0)
+ mptcp_set_state(sk, TCP_CLOSE);
+
sock_hold(sk);
- pr_debug("msk=%p state=%d", sk, sk->sk_state);
- if (msk->token)
- mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+ pr_debug("msk=%p state=%d\n", sk, sk->sk_state);
+ mptcp_pm_connection_closed(msk);
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
- mptcp_reset_timeout(msk, 0);
+ mptcp_start_tout_timer(sk);
}
return do_cancel_work;
@@ -2966,7 +3316,7 @@ static void mptcp_close(struct sock *sk, long timeout)
sock_put(sk);
}
-void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
@@ -2989,46 +3339,84 @@ void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
+static void mptcp_destroy_common(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ __mptcp_clear_xmit(sk);
+ mptcp_backlog_purge(sk);
+
+ /* join list will be eventually flushed (with rst) at sock lock release time */
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ skb_rbtree_purge(&msk->out_of_order_queue);
+
+	/* move all the rx fwd alloc into the sk_mem_reclaim_final;
+	 * inet_sock_destruct() will dispose of it
+ */
+ mptcp_token_destroy(msk);
+ mptcp_pm_destroy(msk);
+}
+
static int mptcp_disconnect(struct sock *sk, int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* We are on the fastopen error path. We can't call straight into the
* subflows cleanup code due to lock nesting (we are already under
- * msk->firstsocket lock). Do nothing and leave the cleanup to the
- * caller.
+ * msk->firstsocket lock).
*/
if (msk->fastopening)
- return 0;
+ return -EBUSY;
- inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_check_listen_stop(sk);
+ mptcp_set_state(sk, TCP_CLOSE);
- mptcp_stop_timer(sk);
- sk_stop_timer(sk, &sk->sk_timer);
+ mptcp_stop_rtx_timer(sk);
+ mptcp_stop_tout_timer(sk);
- if (msk->token)
- mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+ mptcp_pm_connection_closed(msk);
/* msk->subflow is still intact, the following will not free the first
* subflow
*/
- mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
- msk->last_snd = NULL;
+ mptcp_do_fastclose(sk);
+ mptcp_destroy_common(msk);
+
+ /* The first subflow is already in TCP_CLOSE status, the following
+ * can't overlap with a fallback anymore
+ */
+ spin_lock_bh(&msk->fallback_lock);
+ msk->allow_subflows = true;
+ msk->allow_infinite_fallback = true;
WRITE_ONCE(msk->flags, 0);
+ spin_unlock_bh(&msk->fallback_lock);
+
msk->cb_flags = 0;
- msk->push_pending = 0;
msk->recovery = false;
- msk->can_ack = false;
- msk->fully_established = false;
- msk->rcv_data_fin = false;
- msk->snd_data_fin_enable = false;
- msk->rcv_fastclose = false;
- msk->use_64bit_ack = false;
+ WRITE_ONCE(msk->can_ack, false);
+ WRITE_ONCE(msk->fully_established, false);
+ WRITE_ONCE(msk->rcv_data_fin, false);
+ WRITE_ONCE(msk->snd_data_fin_enable, false);
+ WRITE_ONCE(msk->rcv_fastclose, false);
+ WRITE_ONCE(msk->use_64bit_ack, false);
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_reset(msk);
mptcp_ca_reset(sk);
+ msk->bytes_consumed = 0;
+ msk->bytes_acked = 0;
+ msk->bytes_received = 0;
+ msk->bytes_sent = 0;
+ msk->bytes_retrans = 0;
+ msk->rcvspace_init = 0;
+
+ /* for fallback's sake */
+ WRITE_ONCE(msk->ack_seq, 0);
- sk->sk_shutdown = 0;
+ WRITE_ONCE(sk->sk_shutdown, 0);
sk_error_report(sk);
return 0;
}
@@ -3036,18 +3424,59 @@ static int mptcp_disconnect(struct sock *sk, int flags)
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
- unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
+ struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk);
+
+ return &msk6->np;
+}
+
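+/* Duplicate the parent socket IPv6 tx options, if any, onto the newly
+ * cloned msk.
+ */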
+static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk)
+{
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
+ struct ipv6_pinfo *newnp;
+
+ newnp = inet6_sk(newsk);
- return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+ rcu_read_lock();
+ opt = rcu_dereference(np->opt);
+ if (opt) {
+ opt = ipv6_dup_options(newsk, opt);
+ if (!opt)
+ net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__);
+ }
+ RCU_INIT_POINTER(newnp->opt, opt);
+ rcu_read_unlock();
}
#endif
-struct sock *mptcp_sk_clone(const struct sock *sk,
- const struct mptcp_options_received *mp_opt,
- struct request_sock *req)
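+/* Duplicate the parent socket IPv4 options, if any, onto the newly
+ * cloned msk.
+ */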
+static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk)
+{
+ struct ip_options_rcu *inet_opt, *newopt = NULL;
+ const struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *newinet;
+
+ newinet = inet_sk(newsk);
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) +
+ inet_opt->opt.optlen, GFP_ATOMIC);
+ if (!newopt)
+ net_warn_ratelimited("%s: Failed to copy ip options\n", __func__);
+ }
+ RCU_INIT_POINTER(newinet->inet_opt, newopt);
+ rcu_read_unlock();
+}
+
+struct sock *mptcp_sk_clone_init(const struct sock *sk,
+ const struct mptcp_options_received *mp_opt,
+ struct sock *ssk,
+ struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
+ struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk;
if (!nsk)
@@ -3060,29 +3489,63 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
__mptcp_init_sock(nsk);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (nsk->sk_family == AF_INET6)
+ mptcp_copy_ip6_options(nsk, sk);
+ else
+#endif
+ mptcp_copy_ip_options(nsk, sk);
+
msk = mptcp_sk(nsk);
- msk->local_key = subflow_req->local_key;
- msk->token = subflow_req->token;
- msk->subflow = NULL;
+ WRITE_ONCE(msk->local_key, subflow_req->local_key);
+ WRITE_ONCE(msk->token, subflow_req->token);
+ msk->in_accept_queue = 1;
WRITE_ONCE(msk->fully_established, false);
if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
WRITE_ONCE(msk->csum_enabled, true);
- msk->write_seq = subflow_req->idsn + 1;
- msk->snd_nxt = msk->write_seq;
- msk->snd_una = msk->write_seq;
- msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
+ WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1);
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
+ WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+ mptcp_init_sched(msk, mptcp_sk(sk)->sched);
- sock_reset_flag(nsk, SOCK_RCU_FREE);
- /* will be fully established after successful MPC subflow creation */
- inet_sk_state_store(nsk, TCP_SYN_RECV);
+ /* passive msk is created after the first/MPC subflow */
+ msk->subflow_id = 2;
+ sock_reset_flag(nsk, SOCK_RCU_FREE);
security_inet_csk_clone(nsk, req);
+
+ /* this can't race with mptcp_close(), as the msk is
+	 * not yet exposed to user-space
+ */
+ mptcp_set_state(nsk, TCP_ESTABLISHED);
+
+	/* The msk maintains a ref to each subflow in the connections list */
+ WRITE_ONCE(msk->first, ssk);
+ subflow = mptcp_subflow_ctx(ssk);
+ list_add(&subflow->node, &msk->conn_list);
+ sock_hold(ssk);
+
+ /* new mpc subflow takes ownership of the newly
+ * created mptcp socket
+ */
+ mptcp_token_accept(subflow_req, msk);
+
+ /* set msk addresses early to ensure mptcp_pm_get_local_id()
+ * uses the correct data
+ */
+ mptcp_copy_inaddrs(nsk, ssk);
+ __mptcp_propagate_sndbuf(nsk, ssk);
+
+ mptcp_rcv_space_init(msk, ssk);
+
+ if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
+ __mptcp_subflow_fully_established(msk, subflow, mp_opt);
bh_unlock_sock(nsk);
- /* keep a single reference */
- __sock_put(nsk);
+ /* note: the newly allocated socket refcount is 2 now */
return nsk;
}
@@ -3090,6 +3553,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
const struct tcp_sock *tp = tcp_sk(ssk);
+ msk->rcvspace_init = 1;
msk->rcvq_space.copied = 0;
msk->rcvq_space.rtt_us = 0;
@@ -3100,95 +3564,15 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
TCP_INIT_CWND * tp->advmss);
if (msk->rcvq_space.space == 0)
msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
-
- WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
-}
-
-static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
- bool kern)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *listener;
- struct sock *newsk;
-
- listener = __mptcp_nmpc_socket(msk);
- if (WARN_ON_ONCE(!listener)) {
- *err = -EINVAL;
- return NULL;
- }
-
- pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
- newsk = inet_csk_accept(listener->sk, flags, err, kern);
- if (!newsk)
- return NULL;
-
- pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
- if (sk_is_mptcp(newsk)) {
- struct mptcp_subflow_context *subflow;
- struct sock *new_mptcp_sock;
-
- subflow = mptcp_subflow_ctx(newsk);
- new_mptcp_sock = subflow->conn;
-
- /* is_mptcp should be false if subflow->conn is missing, see
- * subflow_syn_recv_sock()
- */
- if (WARN_ON_ONCE(!new_mptcp_sock)) {
- tcp_sk(newsk)->is_mptcp = 0;
- goto out;
- }
-
- /* acquire the 2nd reference for the owning socket */
- sock_hold(new_mptcp_sock);
- newsk = new_mptcp_sock;
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
- } else {
- MPTCP_INC_STATS(sock_net(sk),
- MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
- }
-
-out:
- newsk->sk_kern_sock = kern;
- return newsk;
-}
-
-void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
-{
- struct mptcp_subflow_context *subflow, *tmp;
- struct sock *sk = (struct sock *)msk;
-
- __mptcp_clear_xmit(sk);
-
- /* join list will be eventually flushed (with rst) at sock lock release time */
- mptcp_for_each_subflow_safe(msk, subflow, tmp)
- __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
-
- /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
- mptcp_data_lock(sk);
- skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
- __skb_queue_purge(&sk->sk_receive_queue);
- skb_rbtree_purge(&msk->out_of_order_queue);
- mptcp_data_unlock(sk);
-
- /* move all the rx fwd alloc into the sk_mem_reclaim_final in
- * inet_sock_destruct() will dispose it
- */
- sk->sk_forward_alloc += msk->rmem_fwd_alloc;
- msk->rmem_fwd_alloc = 0;
- mptcp_token_destroy(msk);
- mptcp_pm_free_anno_list(msk);
- mptcp_free_local_addr_list(msk);
}
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- /* clears msk->subflow, allowing the following to close
- * even the initial subflow
- */
- mptcp_dispose_initial_subflow(msk);
- mptcp_destroy_common(msk, 0);
+ /* allow the following to close even the initial subflow */
+ msk->free_first = 1;
+ mptcp_destroy_common(msk);
sk_sockets_allocated_dec(sk);
}
@@ -3198,16 +3582,10 @@ void __mptcp_data_acked(struct sock *sk)
__mptcp_clean_una(sk);
else
__set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
-
- if (mptcp_pending_data_fin_ack(sk))
- mptcp_schedule_work(sk);
}
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
- if (!mptcp_send_head(sk))
- return;
-
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk, false);
else
@@ -3225,11 +3603,18 @@ static void mptcp_release_cb(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
for (;;) {
- unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
- msk->push_pending;
- if (!flags)
+ unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
+ struct list_head join_list, skbs;
+ bool spool_bl;
+ u32 moved;
+
+ spool_bl = mptcp_can_spool_backlog(sk, &skbs);
+ if (!flags && !spool_bl)
break;
+ INIT_LIST_HEAD(&join_list);
+ list_splice_init(&msk->join_list, &join_list);
+
/* the following actions acquire the subflow socket lock
*
* 1) can't be invoked in atomic scope
@@ -3237,35 +3622,41 @@ static void mptcp_release_cb(struct sock *sk)
* datapath acquires the msk socket spinlock while helding
* the subflow socket lock
*/
- msk->push_pending = 0;
msk->cb_flags &= ~flags;
spin_unlock_bh(&sk->sk_lock.slock);
+
if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
- __mptcp_flush_join_list(sk);
+ __mptcp_flush_join_list(sk, &join_list);
if (flags & BIT(MPTCP_PUSH_PENDING))
__mptcp_push_pending(sk, 0);
if (flags & BIT(MPTCP_RETRANSMIT))
__mptcp_retrans(sk);
+ if (spool_bl && __mptcp_move_skbs(sk, &skbs, &moved)) {
+ /* notify ack seq update */
+ mptcp_cleanup_rbuf(msk, 0);
+ sk->sk_data_ready(sk);
+ }
cond_resched();
spin_lock_bh(&sk->sk_lock.slock);
+ if (spool_bl)
+ mptcp_backlog_spooled(sk, moved, &skbs);
}
if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk);
- if (unlikely(&msk->cb_flags)) {
- /* be sure to set the current sk state before tacking actions
- * depending on sk_state, that is processing MPTCP_ERROR_REPORT
+ if (unlikely(msk->cb_flags)) {
+ /* be sure to sync the msk state before taking actions
+ * depending on sk_state (MPTCP_ERROR_REPORT)
+ * On sk release avoid actions depending on the first subflow
*/
- if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
- __mptcp_set_connected(sk);
+ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first)
+ __mptcp_sync_state(sk, msk->pending_state);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
- if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
- msk->last_snd = NULL;
+ if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
+ __mptcp_sync_sndbuf(sk);
}
-
- __mptcp_update_rmem(sk);
}
/* MP_JOIN client subflow must wait for 4th ack before sending any data:
@@ -3278,7 +3669,7 @@ static void schedule_3rdack_retransmission(struct sock *ssk)
struct tcp_sock *tp = tcp_sk(ssk);
unsigned long timeout;
- if (mptcp_subflow_ctx(ssk)->fully_established)
+ if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established))
return;
/* reschedule with a timeout above RTT, as we must look only for drop */
@@ -3289,35 +3680,40 @@ static void schedule_3rdack_retransmission(struct sock *ssk)
timeout += jiffies;
WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
- icsk->icsk_ack.timeout = timeout;
+ smp_store_release(&icsk->icsk_ack.pending,
+ icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout);
}
-void mptcp_subflow_process_delegated(struct sock *ssk)
+void mptcp_subflow_process_delegated(struct sock *ssk, long status)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;
- if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
+ if (status & BIT(MPTCP_DELEGATE_SEND)) {
mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk, true);
else
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
- mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);
}
- if (test_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status)) {
- schedule_3rdack_retransmission(ssk);
- mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_ACK);
+ if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_sync_sndbuf(sk);
+ else
+ __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
+ mptcp_data_unlock(sk);
}
+ if (status & BIT(MPTCP_DELEGATE_ACK))
+ schedule_3rdack_retransmission(ssk);
}
static int mptcp_hash(struct sock *sk)
{
/* should never be called,
- * we hash the TCP subflows not the master socket
+ * we hash the TCP subflows not the MPTCP socket
*/
WARN_ON_ONCE(1);
return 0;
@@ -3331,14 +3727,12 @@ static void mptcp_unhash(struct sock *sk)
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
- ssock = __mptcp_nmpc_socket(msk);
- pr_debug("msk=%p, subflow=%p", msk, ssock);
- if (WARN_ON_ONCE(!ssock))
+ pr_debug("msk=%p, ssk=%p\n", msk, msk->first);
+ if (WARN_ON_ONCE(!msk->first))
return -EINVAL;
- return inet_csk_get_port(ssock->sk, snum);
+ return inet_csk_get_port(msk->first, snum);
}
void mptcp_finish_connect(struct sock *ssk)
@@ -3351,7 +3745,7 @@ void mptcp_finish_connect(struct sock *ssk)
sk = subflow->conn;
msk = mptcp_sk(sk);
- pr_debug("msk=%p, token=%u", sk, subflow->token);
+ pr_debug("msk=%p, token=%u\n", sk, subflow->token);
subflow->map_seq = subflow->iasn;
subflow->map_subflow_seq = 1;
@@ -3360,13 +3754,8 @@ void mptcp_finish_connect(struct sock *ssk)
* accessing the field below
*/
WRITE_ONCE(msk->local_key, subflow->local_key);
- WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
- WRITE_ONCE(msk->snd_nxt, msk->write_seq);
- WRITE_ONCE(msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, ssk, 0);
-
- mptcp_rcv_space_init(msk, ssk);
}
void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -3374,10 +3763,26 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
sk_set_socket(sk, parent);
- sk->sk_uid = SOCK_INODE(parent)->i_uid;
write_unlock_bh(&sk->sk_callback_lock);
}
+/* Can be called without holding the msk socket lock; use the callback lock
+ * to avoid {READ_,WRITE_}ONCE annotations on sk_socket.
+ */
+static void mptcp_sock_check_graft(struct sock *sk, struct sock *ssk)
+{
+ struct socket *sock;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ sock = sk->sk_socket;
+ write_unlock_bh(&sk->sk_callback_lock);
+ if (sock) {
+ mptcp_sock_graft(ssk, sock);
+ __mptcp_inherit_cgrp_data(sk, ssk);
+ __mptcp_inherit_memcg(sk, ssk, GFP_ATOMIC);
+ }
+}
+
bool mptcp_finish_join(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -3385,7 +3790,7 @@ bool mptcp_finish_join(struct sock *ssk)
struct sock *parent = (void *)msk;
bool ret = true;
- pr_debug("msk=%p, subflow=%p", msk, subflow);
+ pr_debug("msk=%p, subflow=%p\n", msk, subflow);
/* mptcp socket already closing? */
if (!mptcp_is_fully_established(parent)) {
@@ -3393,14 +3798,27 @@ bool mptcp_finish_join(struct sock *ssk)
return false;
}
- if (!list_empty(&subflow->node))
- goto out;
+	/* Active subflow, already present inside the conn_list; it was grafted
+	 * either by __mptcp_subflow_connect() or by accept().
+ */
+ if (!list_empty(&subflow->node)) {
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_subflows) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+ mptcp_subflow_joined(msk, ssk);
+ spin_unlock_bh(&msk->fallback_lock);
+ mptcp_propagate_sndbuf(parent, ssk);
+ return true;
+ }
- if (!mptcp_pm_allow_new_subflow(msk))
+ if (!mptcp_pm_allow_new_subflow(msk)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED);
goto err_prohibited;
+ }
- /* active connections are already on conn_list.
- * If we can't acquire msk socket lock here, let the release callback
+ /* If we can't acquire msk socket lock here, let the release callback
* handle it
*/
mptcp_data_lock(parent);
@@ -3409,11 +3827,17 @@ bool mptcp_finish_join(struct sock *ssk)
if (ret) {
sock_hold(ssk);
list_add_tail(&subflow->node, &msk->conn_list);
+ mptcp_sock_check_graft(parent, ssk);
}
} else {
sock_hold(ssk);
list_add_tail(&subflow->node, &msk->join_list);
__set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
+
+ /* In case of later failures, __mptcp_flush_join_list() will
+ * properly orphan the ssk via mptcp_close_ssk().
+ */
+ mptcp_sock_check_graft(parent, ssk);
}
mptcp_data_unlock(parent);
@@ -3423,27 +3847,17 @@ err_prohibited:
return false;
}
- subflow->map_seq = READ_ONCE(msk->ack_seq);
- WRITE_ONCE(msk->allow_infinite_fallback, false);
-
-out:
- mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
return true;
}
static void mptcp_shutdown(struct sock *sk, int how)
{
- pr_debug("sk=%p, how=%d", sk, how);
+ pr_debug("sk=%p, how=%d\n", sk, how);
if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
__mptcp_wr_shutdown(sk);
}
-static int mptcp_forward_alloc_get(const struct sock *sk)
-{
- return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
-}
-
static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
{
const struct sock *sk = (void *)msk;
@@ -3473,11 +3887,10 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
return (int)delta;
}
-static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct mptcp_sock *msk = mptcp_sk(sk);
bool slow;
- int answ;
switch (cmd) {
case SIOCINQ:
@@ -3485,87 +3898,104 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -EINVAL;
lock_sock(sk);
- __mptcp_move_skbs(msk);
- answ = mptcp_inq_hint(sk);
+ if (mptcp_move_skbs(sk))
+ mptcp_cleanup_rbuf(msk, 0);
+ *karg = mptcp_inq_hint(sk);
release_sock(sk);
break;
case SIOCOUTQ:
slow = lock_sock_fast(sk);
- answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
+ *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
unlock_sock_fast(sk, slow);
break;
case SIOCOUTQNSD:
slow = lock_sock_fast(sk);
- answ = mptcp_ioctl_outq(msk, msk->snd_nxt);
+ *karg = mptcp_ioctl_outq(msk, msk->snd_nxt);
unlock_sock_fast(sk, slow);
break;
default:
return -ENOIOCTLCMD;
}
- return put_user(answ, (int __user *)arg);
-}
-
-static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- subflow->request_mptcp = 0;
- __mptcp_do_fallback(msk);
+ return 0;
}
-static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
int err = -EINVAL;
+ struct sock *ssk;
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
- return -EINVAL;
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
- mptcp_token_destroy(msk);
- inet_sk_state_store(sk, TCP_SYN_SENT);
- subflow = mptcp_subflow_ctx(ssock->sk);
+ mptcp_set_state(sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssk);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
- if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
- mptcp_subflow_early_fallback(msk, subflow);
+ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info))
+ mptcp_early_fallback(msk, subflow, MPTCP_MIB_MD5SIGFALLBACK);
#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
- MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
- mptcp_subflow_early_fallback(msk, subflow);
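+	/* fall back to plain TCP when active MPTCP creation is currently
+	 * disabled or no token can be allocated
+	 */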
+ if (subflow->request_mptcp) {
+ if (mptcp_active_should_disable(sk))
+ mptcp_early_fallback(msk, subflow,
+ MPTCP_MIB_MPCAPABLEACTIVEDISABLED);
+ else if (mptcp_token_new_connect(ssk) < 0)
+ mptcp_early_fallback(msk, subflow,
+ MPTCP_MIB_TOKENFALLBACKINIT);
}
+
+ WRITE_ONCE(msk->write_seq, subflow->idsn);
+ WRITE_ONCE(msk->snd_nxt, subflow->idsn);
+ WRITE_ONCE(msk->snd_una, subflow->idsn);
if (likely(!__mptcp_check_fallback(msk)))
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
/* if reaching here via the fastopen/sendmsg path, the caller already
* acquired the subflow socket lock, too.
*/
- if (msk->fastopening)
- err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1);
- else
- err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags);
- inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
+ if (!msk->fastopening)
+ lock_sock(ssk);
+
+ /* the following mirrors closely a very small chunk of code from
+ * __inet_stream_connect()
+ */
+ if (ssk->sk_state != TCP_CLOSE)
+ goto out;
+
+ if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) {
+ err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len);
+ if (err)
+ goto out;
+ }
+
+ err = ssk->sk_prot->connect(ssk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk));
+
+out:
+ if (!msk->fastopening)
+ release_sock(ssk);
/* on successful connect, the msk state will be moved to established by
* subflow_finish_connect()
*/
- if (unlikely(err && err != -EINPROGRESS)) {
- inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ if (unlikely(err)) {
+ /* avoid leaving a dangling token in an unconnected socket */
+ mptcp_token_destroy(msk);
+ mptcp_set_state(sk, TCP_CLOSE);
return err;
}
- mptcp_copy_inaddrs(sk, ssock->sk);
-
- /* unblocking connect, mptcp-level inet_stream_connect will error out
- * without changing the socket state, update it here.
- */
- if (err == -EINPROGRESS)
- sk->sk_socket->state = ssock->state;
- return err;
+ mptcp_copy_inaddrs(sk, ssk);
+ return 0;
}
static struct proto mptcp_prot = {
@@ -3575,7 +4005,6 @@ static struct proto mptcp_prot = {
.connect = mptcp_connect,
.disconnect = mptcp_disconnect,
.close = mptcp_close,
- .accept = mptcp_accept,
.setsockopt = mptcp_setsockopt,
.getsockopt = mptcp_getsockopt,
.shutdown = mptcp_shutdown,
@@ -3587,10 +4016,10 @@ static struct proto mptcp_prot = {
.hash = mptcp_hash,
.unhash = mptcp_unhash,
.get_port = mptcp_get_port,
- .forward_alloc_get = mptcp_forward_alloc_get,
+ .stream_memory_free = mptcp_stream_memory_free,
.sockets_allocated = &mptcp_sockets_allocated,
- .memory_allocated = &tcp_memory_allocated,
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
@@ -3602,138 +4031,224 @@ static struct proto mptcp_prot = {
.no_autobind = true,
};
-static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct socket *ssock;
- int err;
+ struct sock *ssk, *sk = sock->sk;
+ int err = -EINVAL;
- lock_sock(sock->sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ err = PTR_ERR(ssk);
goto unlock;
}
- err = ssock->ops->bind(ssock, uaddr, addr_len);
+ if (sk->sk_family == AF_INET)
+ err = inet_bind_sk(ssk, uaddr, addr_len);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (sk->sk_family == AF_INET6)
+ err = inet6_bind_sk(ssk, uaddr, addr_len);
+#endif
if (!err)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ mptcp_copy_inaddrs(sk, ssk);
unlock:
- release_sock(sock->sk);
+ release_sock(sk);
return err;
}
-static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
-{
- int ret;
-
- lock_sock(sock->sk);
- mptcp_sk(sock->sk)->connect_flags = flags;
- ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
- release_sock(sock->sk);
- return ret;
-}
-
static int mptcp_listen(struct socket *sock, int backlog)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct socket *ssock;
+ struct sock *sk = sock->sk;
+ struct sock *ssk;
int err;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
+
+ lock_sock(sk);
- lock_sock(sock->sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto unlock;
+
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ err = PTR_ERR(ssk);
goto unlock;
}
- mptcp_token_destroy(msk);
- inet_sk_state_store(sock->sk, TCP_LISTEN);
- sock_set_flag(sock->sk, SOCK_RCU_FREE);
+ mptcp_set_state(sk, TCP_LISTEN);
+ sock_set_flag(sk, SOCK_RCU_FREE);
- err = ssock->ops->listen(ssock, backlog);
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
- if (!err)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ lock_sock(ssk);
+ err = __inet_listen_sk(ssk, backlog);
+ release_sock(ssk);
+ mptcp_set_state(sk, inet_sk_state_load(ssk));
- mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED);
+ if (!err) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ mptcp_copy_inaddrs(sk, ssk);
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
+ }
unlock:
- release_sock(sock->sk);
+ release_sock(sk);
return err;
}
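+/* Graft every subflow of a freshly accept()ed msk to the new socket,
+ * propagate cgroup/memcg data and charge to the memcg any backlog
+ * memory accumulated before the accept completed.
+ */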
+static void mptcp_graft_subflows(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (mem_cgroup_sockets_enabled) {
+ LIST_HEAD(join_list);
+
+ /* Subflows joining after __inet_accept() will get the
+ * mem CG properly initialized at mptcp_finish_join() time,
+ * but subflows pending in join_list need explicit
+		 * initialization before flushing `backlog_unaccounted`,
+		 * or MPTCP could later observe unexpected unaccounted memory.
+ */
+ mptcp_data_lock(sk);
+ list_splice_init(&msk->join_list, &join_list);
+ mptcp_data_unlock(sk);
+
+ __mptcp_flush_join_list(sk, &join_list);
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+
+ /* Set ssk->sk_socket of accept()ed flows to mptcp socket.
+ * This is needed so NOSPACE flag can be set from tcp stack.
+ */
+ if (!ssk->sk_socket)
+ mptcp_sock_graft(ssk, sk->sk_socket);
+
+ if (!mem_cgroup_sk_enabled(sk))
+ goto unlock;
+
+ __mptcp_inherit_cgrp_data(sk, ssk);
+ __mptcp_inherit_memcg(sk, ssk, GFP_KERNEL);
+
+unlock:
+ release_sock(ssk);
+ }
+
+ if (mem_cgroup_sk_enabled(sk)) {
+ gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
+ int amt;
+
+ /* Account the backlog memory; the preceding accept() only
+ * accounted for fwd and rmem.
+ */
+ mptcp_data_lock(sk);
+ amt = sk_mem_pages(sk->sk_forward_alloc +
+ msk->backlog_unaccounted +
+ atomic_read(&sk->sk_rmem_alloc)) -
+ sk_mem_pages(sk->sk_forward_alloc +
+ atomic_read(&sk->sk_rmem_alloc));
+ msk->backlog_unaccounted = 0;
+ mptcp_data_unlock(sk);
+
+ if (amt)
+ mem_cgroup_sk_charge(sk, amt, gfp);
+ }
+}
+
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
- int flags, bool kern)
+ struct proto_accept_arg *arg)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct socket *ssock;
- int err;
+ struct sock *ssk, *newsk;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
+ /* Buggy applications can call accept on socket states other than LISTEN
+ * but no need to allocate the first subflow just to error out.
+ */
+ ssk = READ_ONCE(msk->first);
+ if (!ssk)
return -EINVAL;
- err = ssock->ops->accept(sock, newsock, flags, kern);
- if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
- struct mptcp_sock *msk = mptcp_sk(newsock->sk);
+ pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk));
+ newsk = inet_csk_accept(ssk, arg);
+ if (!newsk)
+ return arg->err;
+
+ pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk));
+ if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
- struct sock *newsk = newsock->sk;
+ struct sock *new_mptcp_sock;
- set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+ subflow = mptcp_subflow_ctx(newsk);
+ new_mptcp_sock = subflow->conn;
+
+ /* is_mptcp should be false if subflow->conn is missing, see
+ * subflow_syn_recv_sock()
+ */
+ if (WARN_ON_ONCE(!new_mptcp_sock)) {
+ tcp_sk(newsk)->is_mptcp = 0;
+ goto tcpfallback;
+ }
+ newsk = new_mptcp_sock;
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+
+ newsk->sk_kern_sock = arg->kern;
lock_sock(newsk);
+ __inet_accept(sock, newsock, newsk);
- /* PM/worker can now acquire the first subflow socket
- * lock without racing with listener queue cleanup,
- * we can notify it, if needed.
- *
- * Even if remote has reset the initial subflow by now
- * the refcnt is still at least one.
- */
- subflow = mptcp_subflow_ctx(msk->first);
- list_add(&subflow->node, &msk->conn_list);
- sock_hold(msk->first);
- if (mptcp_is_fully_established(newsk))
- mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
+ set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+ msk = mptcp_sk(newsk);
+ msk->in_accept_queue = 0;
- mptcp_rcv_space_init(msk, msk->first);
- mptcp_propagate_sndbuf(newsk, msk->first);
+ mptcp_graft_subflows(newsk);
+ mptcp_rps_record_subflows(msk);
- /* set ssk->sk_socket of accept()ed flows to mptcp socket.
- * This is needed so NOSPACE flag can be set from tcp stack.
+ /* Do late cleanup for the first subflow as necessary. Also
+ * deal with bad peers not doing a complete shutdown.
*/
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- if (!ssk->sk_socket)
- mptcp_sock_graft(ssk, newsock);
+ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) {
+ if (unlikely(list_is_singular(&msk->conn_list)))
+ mptcp_set_state(newsk, TCP_CLOSE);
+ mptcp_close_ssk(newsk, msk->first,
+ mptcp_subflow_ctx(msk->first));
}
- release_sock(newsk);
+ } else {
+tcpfallback:
+ newsk->sk_kern_sock = arg->kern;
+ lock_sock(newsk);
+ __inet_accept(sock, newsock, newsk);
+ /* we are being invoked after accepting a non-mp-capable
+ * flow: sk is a tcp_sk, not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ WRITE_ONCE(newsock->sk->sk_socket->ops,
+ mptcp_fallback_tcp_ops(newsock->sk));
}
+ release_sock(newsk);
- return err;
+ return 0;
}
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
- if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+ if (__mptcp_stream_is_writeable(sk, 1))
return EPOLLOUT | EPOLLWRNORM;
- if (sk_stream_is_writeable(sk))
- return EPOLLOUT | EPOLLWRNORM;
-
- mptcp_set_nospace(sk);
- smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
- if (sk_stream_is_writeable(sk))
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
+ if (__mptcp_stream_is_writeable(sk, 1))
return EPOLLOUT | EPOLLWRNORM;
return 0;
@@ -3745,35 +4260,44 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
struct mptcp_sock *msk;
__poll_t mask = 0;
+ u8 shutdown;
int state;
msk = mptcp_sk(sk);
sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
- pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
+ pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags);
if (state == TCP_LISTEN) {
- if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk))
+ struct sock *ssk = READ_ONCE(msk->first);
+
+ if (WARN_ON_ONCE(!ssk))
return 0;
- return inet_csk_listen_poll(msk->subflow->sk);
+ return inet_csk_listen_poll(ssk);
}
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ mask |= EPOLLHUP;
+ if (shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
- mask |= mptcp_check_readable(msk);
- mask |= mptcp_check_writeable(msk);
- } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ mask |= mptcp_check_readable(sk);
+ if (shutdown & SEND_SHUTDOWN)
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ else
+ mask |= mptcp_check_writeable(msk);
+ } else if (state == TCP_SYN_SENT &&
+ inet_test_bit(DEFER_CONNECT, sk)) {
/* cf tcp_poll() note about TFO */
mask |= EPOLLOUT | EPOLLWRNORM;
}
- if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
- mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
- if (sk->sk_err)
+ if (READ_ONCE(sk->sk_err))
mask |= EPOLLERR;
return mask;
@@ -3784,7 +4308,7 @@ static const struct proto_ops mptcp_stream_ops = {
.owner = THIS_MODULE,
.release = inet_release,
.bind = mptcp_bind,
- .connect = mptcp_stream_connect,
+ .connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = mptcp_stream_accept,
.getname = inet_getname,
@@ -3798,7 +4322,7 @@ static const struct proto_ops mptcp_stream_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
+ .set_rcvlowat = mptcp_set_rcvlowat,
};
static struct inet_protosw mptcp_protosw = {
@@ -3820,14 +4344,17 @@ static int mptcp_napi_poll(struct napi_struct *napi, int budget)
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bh_lock_sock_nested(ssk);
- if (!sock_owned_by_user(ssk) &&
- mptcp_subflow_has_delegated_action(subflow))
- mptcp_subflow_process_delegated(ssk);
- /* ... elsewhere tcp_release_cb_override already processed
- * the action or will do at next release_sock().
- * In both case must dequeue the subflow here - on the same
- * CPU that scheduled it.
- */
+ if (!sock_owned_by_user(ssk)) {
+ mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0));
+ } else {
+ /* tcp_release_cb_override already processed
+ * the action or will do at next release_sock().
+ * In both cases we must dequeue the subflow here - on the same
+ * CPU that scheduled it.
+ */
+ smp_wmb();
+ clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status);
+ }
bh_unlock_sock(ssk);
sock_put(ssk);
@@ -3852,17 +4379,20 @@ void __init mptcp_proto_init(void)
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");
- init_dummy_netdev(&mptcp_napi_dev);
+ mptcp_napi_dev = alloc_netdev_dummy(0);
+ if (!mptcp_napi_dev)
+ panic("Failed to allocate MPTCP dummy netdev\n");
for_each_possible_cpu(cpu) {
delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
INIT_LIST_HEAD(&delegated->head);
- netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+ netif_napi_add_tx(mptcp_napi_dev, &delegated->napi,
mptcp_napi_poll);
napi_enable(&delegated->napi);
}
mptcp_subflow_init();
mptcp_pm_init();
+ mptcp_sched_init();
mptcp_token_init();
if (proto_register(&mptcp_prot, 1) != 0)
@@ -3879,7 +4409,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.owner = THIS_MODULE,
.release = inet6_release,
.bind = mptcp_bind,
- .connect = mptcp_stream_connect,
+ .connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = mptcp_stream_accept,
.getname = inet6_getname,
@@ -3893,10 +4423,10 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.sendmsg = inet6_sendmsg,
.recvmsg = inet6_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet6_compat_ioctl,
#endif
+ .set_rcvlowat = mptcp_set_rcvlowat,
};
static struct proto mptcp_v6_prot;
@@ -3914,9 +4444,10 @@ int __init mptcp_proto_v6_init(void)
int err;
mptcp_v6_prot = mptcp_prot;
- strcpy(mptcp_v6_prot.name, "MPTCPv6");
+ strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name));
mptcp_v6_prot.slab = NULL;
mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
+ mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np);
err = proto_register(&mptcp_v6_prot, 1);
if (err)
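
For reference, the proto_ops reworked above are what back the ordinary socket
calls on an MPTCP socket. A minimal userspace listener exercising bind(),
listen(), accept() and poll() against these paths could look like the sketch
below (an illustration only, with hypothetical port/backlog values, assuming a
libc that knows IPPROTO_MPTCP, value 262):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(8080),
				    .sin_addr.s_addr = htonl(INADDR_ANY) };
	struct pollfd pfd;
	int lfd, cfd;

	/* served in-kernel by mptcp_bind() and mptcp_listen() above */
	lfd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (lfd < 0 || bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(lfd, 16) < 0) {
		perror("mptcp listener");
		return 1;
	}

	/* mptcp_poll() reports listeners via inet_csk_listen_poll() */
	pfd.fd = lfd;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		/* mptcp_stream_accept() hands back the new MPTCP socket */
		cfd = accept(lfd, NULL, NULL);
		if (cfd >= 0)
			close(cfd);
	}

	close(lfd);
	return 0;
}
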
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 601469249da8..9c0d17876b22 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
#include <net/inet_connection_sock.h>
#include <uapi/linux/mptcp.h>
#include <net/genetlink.h>
+#include <net/rstreason.h>
#define MPTCP_SUPPORTED_VERSION 1
@@ -111,11 +112,9 @@
#define MPTCP_RST_TRANSIENT BIT(0)
/* MPTCP socket atomic flags */
-#define MPTCP_NOSPACE 1
-#define MPTCP_WORK_RTX 2
-#define MPTCP_WORK_EOF 3
-#define MPTCP_FALLBACK_DONE 4
-#define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_WORK_RTX 1
+#define MPTCP_FALLBACK_DONE 2
+#define MPTCP_WORK_CLOSE_SUBFLOW 3
/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING 1
@@ -123,14 +122,15 @@
#define MPTCP_ERROR_REPORT 3
#define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
-#define MPTCP_CONNECTED 6
-#define MPTCP_RESET_SCHEDULER 7
+#define MPTCP_SYNC_STATE 6
+#define MPTCP_SYNC_SNDBUF 7
struct mptcp_skb_cb {
u64 map_seq;
u64 end_seq;
u32 offset;
- u8 has_rxtstamp:1;
+ u8 has_rxtstamp;
+ u8 cant_coalesce;
};
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
@@ -150,22 +150,24 @@ struct mptcp_options_received {
u32 subflow_seq;
u16 data_len;
__sum16 csum;
- u16 suboptions;
+ struct_group(status,
+ u16 suboptions;
+ u16 use_map:1,
+ dsn64:1,
+ data_fin:1,
+ use_ack:1,
+ ack64:1,
+ mpc_map:1,
+ reset_reason:4,
+ reset_transient:1,
+ echo:1,
+ backup:1,
+ deny_join_id0:1,
+ __unused:2;
+ );
+ u8 join_id;
u32 token;
u32 nonce;
- u16 use_map:1,
- dsn64:1,
- data_fin:1,
- use_ack:1,
- ack64:1,
- mpc_map:1,
- reset_reason:4,
- reset_transient:1,
- echo:1,
- backup:1,
- deny_join_id0:1,
- __unused:2;
- u8 join_id;
u64 thmac;
u8 hmac[MPTCPOPT_HMAC_LEN];
struct mptcp_addr_info addr;
@@ -220,6 +222,8 @@ struct mptcp_pm_data {
spinlock_t lock; /*protects the whole PM data */
+ struct_group(reset,
+
u8 addr_signal;
bool server_side;
bool work_pending;
@@ -230,13 +234,22 @@ struct mptcp_pm_data {
u8 add_addr_accepted;
u8 local_addr_used;
u8 pm_type;
- u8 subflows;
+ u8 extra_subflows;
u8 status;
+
+ );
+
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_rm_list rm_list_tx;
struct mptcp_rm_list rm_list_rx;
};
+struct mptcp_pm_local {
+ struct mptcp_addr_info addr;
+ u8 flags;
+ int ifindex;
+};
+
struct mptcp_pm_addr_entry {
struct list_head list;
struct mptcp_addr_info addr;
@@ -259,29 +272,35 @@ struct mptcp_data_frag {
struct mptcp_sock {
/* inet_connection_sock must be the first member */
struct inet_connection_sock sk;
- u64 local_key;
- u64 remote_key;
+ u64 local_key; /* protected by the first subflow socket lock,
+ * lockless read access
+ */
+ u64 remote_key; /* same as above */
u64 write_seq;
+ u64 bytes_sent;
u64 snd_nxt;
+ u64 bytes_received;
u64 ack_seq;
atomic64_t rcv_wnd_sent;
u64 rcv_data_fin_seq;
- int rmem_fwd_alloc;
- struct sock *last_snd;
+ u64 bytes_retrans;
+ u64 bytes_consumed;
int snd_burst;
int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
* recovery related fields are under data_lock
* protection
*/
+ u64 bytes_acked;
u64 snd_una;
u64 wnd_end;
+ u32 last_data_sent;
+ u32 last_data_recv;
+ u32 last_ack_recv;
unsigned long timer_ival;
u32 token;
- int rmem_released;
unsigned long flags;
unsigned long cb_flags;
- unsigned long push_pending;
bool recovery; /* closing subflow write queue reinjected */
bool can_ack;
bool fully_established;
@@ -291,33 +310,56 @@ struct mptcp_sock {
bool use_64bit_ack; /* Set when we received a 64-bit DSN */
bool csum_enabled;
bool allow_infinite_fallback;
+ u8 pending_state; /* A subflow asked to set this sk_state,
+ * protected by the msk data lock
+ */
u8 mpc_endpoint_id;
u8 recvmsg_inq:1,
cork:1,
nodelay:1,
- fastopening:1;
- int connect_flags;
+ fastopening:1,
+ in_accept_queue:1,
+ free_first:1,
+ rcvspace_init:1;
+ u32 notsent_lowat;
+ int keepalive_cnt;
+ int keepalive_idle;
+ int keepalive_intvl;
+ int maxseg;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
- struct sk_buff_head receive_queue;
struct list_head conn_list;
struct list_head rtx_queue;
struct mptcp_data_frag *first_pending;
struct list_head join_list;
- struct socket *subflow; /* outgoing connect/listener/!mp_capable */
- struct sock *first;
+ struct sock *first; /* The mptcp ops can safely dereference the subflow
+ * outside the socket lock, using a suitable ONCE
+ * annotation, as such sock is freed after close().
+ */
struct mptcp_pm_data pm;
+ struct mptcp_sched_ops *sched;
struct {
- u32 space; /* bytes copied in last measurement window */
- u32 copied; /* bytes copied in this measurement window */
+ int space; /* bytes copied in last measurement window */
+ int copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
+ u8 scaling_ratio;
+ bool allow_subflows;
- u32 setsockopt_seq;
+ u32 subflow_id;
+ u32 setsockopt_seq;
char ca_name[TCP_CA_NAME_MAX];
- struct mptcp_sock *dl_next;
+
+ spinlock_t fallback_lock; /* protects fallback,
+ * allow_infinite_fallback and
+ * allow_join
+ */
+
+ struct list_head backlog_list; /* protected by the data lock */
+ u32 backlog_len;
+ u32 backlog_unaccounted;
};
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -327,35 +369,56 @@ struct mptcp_sock {
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \
list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
+#define mptcp_next_subflow(__msk, __subflow) \
+ list_next_entry_circular(__subflow, &((__msk)->conn_list), node)
+
+extern struct genl_family mptcp_genl_family;
static inline void msk_owned_by_me(const struct mptcp_sock *msk)
{
sock_owned_by_me((const struct sock *)msk);
}
-static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
+#ifdef CONFIG_DEBUG_NET
+/* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */
+#undef tcp_sk
+#define tcp_sk(ptr) ({ \
+ typeof(ptr) _ptr = (ptr); \
+ WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \
+ container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \
+})
+#define mptcp_sk(ptr) ({ \
+ typeof(ptr) _ptr = (ptr); \
+ WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \
+ container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \
+})
+
+#else /* !CONFIG_DEBUG_NET */
+#define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk)
+#endif
+
+static inline int mptcp_win_from_space(const struct sock *sk, int space)
{
- return (struct mptcp_sock *)sk;
+ return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
}
-/* the msk socket don't use the backlog, also account for the bulk
- * free memory
- */
-static inline int __mptcp_rmem(const struct sock *sk)
+static inline int mptcp_space_from_win(const struct sock *sk, int win)
{
- return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
+ return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win);
}
static inline int __mptcp_space(const struct sock *sk)
{
- return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
+ return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
+ READ_ONCE(mptcp_sk(sk)->backlog_len) -
+ sk_rmem_alloc_get(sk));
}
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
{
const struct mptcp_sock *msk = mptcp_sk(sk);
- return READ_ONCE(msk->first_pending);
+ return msk->first_pending;
}
static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
@@ -370,7 +433,7 @@ static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
+ const struct mptcp_sock *msk = mptcp_sk(sk);
if (!msk->first_pending)
return NULL;
@@ -381,11 +444,11 @@ static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
}
-static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
+static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (msk->snd_una == READ_ONCE(msk->snd_nxt))
+ if (msk->snd_una == msk->snd_nxt)
return NULL;
return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list);
@@ -403,6 +466,7 @@ struct mptcp_subflow_request_sock {
u16 mp_capable : 1,
mp_join : 1,
backup : 1,
+ request_bkup : 1,
csum_reqd : 1,
allow_join_id0 : 1;
u8 local_id;
@@ -424,21 +488,20 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
return (struct mptcp_subflow_request_sock *)rsk;
}
-enum mptcp_data_avail {
- MPTCP_SUBFLOW_NODATA,
- MPTCP_SUBFLOW_DATA_AVAIL,
-};
-
struct mptcp_delegated_action {
struct napi_struct napi;
+ local_lock_t bh_lock;
struct list_head head;
};
DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
-#define MPTCP_DELEGATE_SEND 0
-#define MPTCP_DELEGATE_ACK 1
+#define MPTCP_DELEGATE_SCHEDULED 0
+#define MPTCP_DELEGATE_SEND 1
+#define MPTCP_DELEGATE_ACK 2
+#define MPTCP_DELEGATE_SNDBUF 3
+#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED))
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -450,6 +513,7 @@ struct mptcp_subflow_context {
u64 remote_key;
u64 idsn;
u64 map_seq;
+ u64 rcv_wnd_sent;
u32 snd_isn;
u32 token;
u32 rel_write_seq;
@@ -463,7 +527,6 @@ struct mptcp_subflow_context {
request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */
mp_join : 1, /* remote is JOINing */
- fully_established : 1, /* path validated */
pm_notified : 1, /* PM hook called for established status */
conn_finished : 1,
map_valid : 1,
@@ -475,15 +538,20 @@ struct mptcp_subflow_context {
send_mp_fail : 1,
send_fastclose : 1,
send_infinite_map : 1,
- rx_eof : 1,
remote_key_valid : 1, /* received the peer key from */
disposable : 1, /* ctx can be free at ulp release time */
+ closing : 1, /* must not pass rx data to msk anymore */
stale : 1, /* unable to snd/rcv data, do not use for xmit */
- local_id_valid : 1, /* local_id is correctly initialized */
valid_csum_seen : 1, /* at least one csum validated */
is_mptfo : 1, /* subflow is doing TFO */
+ close_event_done : 1, /* has done the post-closed part */
+ mpc_drop : 1, /* the MPC option has been dropped in a rtx */
__unused : 8;
- enum mptcp_data_avail data_avail;
+ bool data_avail;
+ bool scheduled;
+ bool pm_listener; /* a listener managed by the kernel PM? */
+ bool fully_established; /* path validated */
+ u32 lent_mem_frag;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -492,13 +560,15 @@ struct mptcp_subflow_context {
u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */
u64 iasn; /* initial ack sequence number, MPC subflows only */
};
- u8 local_id;
+ s16 local_id; /* if negative not initialized yet */
u8 remote_id;
u8 reset_seen:1;
u8 reset_transient:1;
u8 reset_reason:4;
u8 stale_count;
+ u32 subflow_id;
+
long delegated_status;
unsigned long fail_tout;
@@ -508,6 +578,9 @@ struct mptcp_subflow_context {
u32 setsockopt_seq;
u32 stale_rcv_tstamp;
+ int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf,
+ * protected by the msk socket lock
+ */
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
@@ -521,7 +594,7 @@ struct mptcp_subflow_context {
static inline struct mptcp_subflow_context *
mptcp_subflow_ctx(const struct sock *sk)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
/* Use RCU on icsk_ulp_data only for sock diag code */
return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
@@ -538,6 +611,80 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
{
memset(&subflow->reset, 0, sizeof(subflow->reset));
subflow->request_mptcp = 1;
+ WRITE_ONCE(subflow->local_id, -1);
+}
+
+/* Convert reset reasons in MPTCP to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_mptcp_reason(u32 reason)
+{
+ switch (reason) {
+ case MPTCP_RST_EUNSPEC:
+ return SK_RST_REASON_MPTCP_RST_EUNSPEC;
+ case MPTCP_RST_EMPTCP:
+ return SK_RST_REASON_MPTCP_RST_EMPTCP;
+ case MPTCP_RST_ERESOURCE:
+ return SK_RST_REASON_MPTCP_RST_ERESOURCE;
+ case MPTCP_RST_EPROHIBIT:
+ return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
+ case MPTCP_RST_EWQ2BIG:
+ return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
+ case MPTCP_RST_EBADPERF:
+ return SK_RST_REASON_MPTCP_RST_EBADPERF;
+ case MPTCP_RST_EMIDDLEBOX:
+ return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
+ default:
+ /* This should not happen, or else errors may occur
+ * in the MPTCP layer
+ */
+ return SK_RST_REASON_ERROR;
+ }
+}
+
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ enum sk_rst_reason reason;
+
+ reason = sk_rst_convert_mptcp_reason(subflow->reset_reason);
+ tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
+/* Make the fwd mem carried by the given skb available to the msk.
+ * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing
+ * the skb or setting the skb ownership.
+ */
+static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb)
+{
+ struct sock *ssk = skb->sk;
+
+ /* The subflow just lent the skb fwd memory; if the subflow meanwhile
+ * closed, mptcp_close_ssk() already released the ssk rcv memory.
+ */
+ DEBUG_NET_WARN_ON_ONCE(skb->destructor);
+ sk_forward_alloc_add(sk, skb->truesize);
+ if (!ssk)
+ return;
+
+ atomic_sub(skb->truesize, &ssk->sk_rmem_alloc);
+ skb->sk = NULL;
+}
+
+static inline void
+__mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size)
+{
+ int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1);
+
+ subflow->lent_mem_frag = frag;
+}
+
+static inline void
+mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb)
+{
+ __mptcp_subflow_lend_fwdmem(subflow, skb->truesize);
+ skb->destructor = NULL;
}
static inline u64
@@ -554,28 +701,31 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
}
-void mptcp_subflow_process_delegated(struct sock *ssk);
+void mptcp_subflow_process_delegated(struct sock *ssk, long actions);
static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action)
{
+ long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action);
struct mptcp_delegated_action *delegated;
bool schedule;
/* the caller held the subflow bh socket lock */
lockdep_assert_in_softirq();
- /* The implied barrier pairs with mptcp_subflow_delegated_done(), and
- * ensures the below list check sees list updates done prior to status
- * bit changes
+ /* The implied barrier pairs with tcp_release_cb_override() and
+ * mptcp_napi_poll(), and ensures the below list check sees list
+ * updates done prior to the delegated status bit changes
*/
- if (!test_and_set_bit(action, &subflow->delegated_status)) {
- /* still on delegated list from previous scheduling */
- if (!list_empty(&subflow->delegated_node))
+ old = set_mask_bits(&subflow->delegated_status, 0, set_bits);
+ if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) {
+ if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node)))
return;
+ local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
delegated = this_cpu_ptr(&mptcp_delegated_actions);
schedule = list_empty(&delegated->head);
list_add_tail(&subflow->delegated_node, &delegated->head);
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
sock_hold(mptcp_subflow_tcp_sock(subflow));
if (schedule)
napi_schedule(&delegated->napi);
@@ -587,37 +737,39 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
{
struct mptcp_subflow_context *ret;
- if (list_empty(&delegated->head))
+ local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
+ if (list_empty(&delegated->head)) {
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
return NULL;
+ }
ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
list_del_init(&ret->delegated_node);
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
return ret;
}
-static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow)
-{
- return !!READ_ONCE(subflow->delegated_status);
-}
-
-static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow, int action)
-{
- /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before
- * touching the status bit
- */
- smp_wmb();
- clear_bit(action, &subflow->delegated_status);
-}
+void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp);
+void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk);
int mptcp_is_enabled(const struct net *net);
unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
+unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net);
-void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk);
-void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
- const struct mptcp_options_received *mp_opt);
+const char *mptcp_get_path_manager(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
+
+void mptcp_active_disable(struct sock *sk);
+bool mptcp_active_should_disable(struct sock *ssk);
+void mptcp_active_enable(struct sock *sk);
+
+void mptcp_get_available_schedulers(char *buf, size_t maxlen);
+void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
void mptcp_check_and_set_pending(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
@@ -630,22 +782,62 @@ void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
-struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
+struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
-void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk);
+void __mptcp_unaccepted_force_close(struct sock *sk);
+void mptcp_set_state(struct sock *sk, int state);
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
const struct mptcp_addr_info *b, bool use_port);
+void mptcp_local_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr);
+void mptcp_remote_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr);
/* called with sk socket lock held */
-int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
struct socket **new_sock);
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_validate_scheduler(struct mptcp_sched_ops *sched);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled);
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
+
+static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed);
+}
+
+static inline bool mptcp_epollin_ready(const struct sock *sk)
+{
+ u64 data_avail = mptcp_data_avail(mptcp_sk(sk));
+
+ if (!data_avail)
+ return false;
+
+ /* mptcp doesn't have to deal with small skbs in the receive queue,
+ * as it can always coalesce them
+ */
+ return (data_avail >= sk->sk_rcvlowat) ||
+ tcp_under_memory_pressure(sk);
+}
+
+int mptcp_set_rcvlowat(struct sock *sk, int val);
static inline bool __tcp_can_send(const struct sock *ssk)
{
@@ -656,7 +848,7 @@ static inline bool __tcp_can_send(const struct sock *ssk)
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
- if (subflow->request_join && !subflow->fully_established)
+ if (subflow->request_join && !READ_ONCE(subflow->fully_established))
return false;
return __tcp_can_send(mptcp_subflow_tcp_sock(subflow));
@@ -666,6 +858,8 @@ void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);
bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);
+void mptcp_subflow_drop_ctx(struct sock *ssk);
+
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
{
@@ -682,20 +876,44 @@ void __init mptcp_proto_init(void);
int __init mptcp_proto_v6_init(void);
#endif
-struct sock *mptcp_sk_clone(const struct sock *sk,
- const struct mptcp_options_received *mp_opt,
- struct request_sock *req);
+struct sock *mptcp_sk_clone_init(const struct sock *sk,
+ const struct mptcp_options_received *mp_opt,
+ struct sock *ssk,
+ struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
-void __mptcp_set_connected(struct sock *sk);
-void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
+void __mptcp_sync_state(struct sock *sk, int state);
+void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);
+
+static inline void mptcp_stop_tout_timer(struct sock *sk)
+{
+ if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
+ return;
+
+ sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
+ inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
+}
+
+static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
+{
+ /* avoid 0 timestamp, as that means no close timeout */
+ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
+}
+
+static inline void mptcp_start_tout_timer(struct sock *sk)
+{
+ mptcp_set_close_tout(sk, tcp_jiffies32);
+ mptcp_reset_tout_timer(mptcp_sk(sk), 0);
+}
+
static inline bool mptcp_is_fully_established(struct sock *sk)
{
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
READ_ONCE(mptcp_sk(sk)->fully_established);
}
+
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
@@ -716,7 +934,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit)
void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
-void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{
@@ -724,26 +941,86 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}
-static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+static inline u32 mptcp_notsent_lowat(const struct sock *sk)
{
- if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
- return false;
+ struct net *net = sock_net(sk);
+ u32 val;
- WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
- return true;
+ val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
+ return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
+}
+
+static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 notsent_bytes;
+
+ notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
+ return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
+}
+
+static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
+{
+ return mptcp_stream_memory_free(sk, wake) &&
+ __sk_stream_is_writeable(sk, wake);
}
static inline void mptcp_write_space(struct sock *sk)
{
- if (sk_stream_is_writeable(sk)) {
- /* pairs with memory barrier in mptcp_poll */
- smp_mb();
- if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
- sk_stream_write_space(sk);
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (mptcp_stream_memory_free(sk, 1))
+ sk_stream_write_space(sk);
+}
+
+static inline void __mptcp_sync_sndbuf(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ int ssk_sndbuf, new_sndbuf;
+
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return;
+
+ new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]);
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
+
+ subflow->cached_sndbuf = ssk_sndbuf;
+ new_sndbuf += ssk_sndbuf;
}
+
+ /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
+ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
+ mptcp_write_space(sk);
}
-void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags);
+/* The caller held both the msk socket and the subflow socket locks,
+ * possibly under BH
+ */
+static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)
+ __mptcp_sync_sndbuf(sk);
+}
+
+/* the caller held only the subflow socket lock, either in process or
+ * BH context. Additionally this can be called under the msk data lock,
+ * so we can't acquire such lock here: let the delegated action acquire
+ * the needed locks in suitable order.
+ */
+static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
+ return;
+
+ local_bh_disable();
+ mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
+ local_bh_enable();
+}
#define MPTCP_TOKEN_MAX_RETRIES 4
@@ -755,7 +1032,7 @@ static inline void mptcp_token_init_request(struct request_sock *req)
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(struct request_sock *req);
-int mptcp_token_new_connect(struct sock *sk);
+int mptcp_token_new_connect(struct sock *ssk);
void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
struct mptcp_sock *msk);
bool mptcp_token_exists(u32 token);
@@ -772,6 +1049,7 @@ __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum su
void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_data_reset(struct mptcp_sock *msk);
+void mptcp_pm_destroy(struct mptcp_sock *msk);
int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
struct mptcp_addr_info *addr);
int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
@@ -781,63 +1059,65 @@ bool mptcp_pm_addr_families_match(const struct sock *sk,
const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *rem);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
-void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
-void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp);
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk);
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
-void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
const struct mptcp_subflow_context *subflow);
void mptcp_pm_add_addr_received(const struct sock *ssk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
-void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk);
+void mptcp_pm_send_ack(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ bool prio, bool backup);
+void mptcp_pm_addr_send_ack(struct mptcp_sock *msk);
+void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_rm_subflow(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq);
-int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr,
- struct mptcp_addr_info *rem,
- u8 bkup);
+int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup);
bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
- const struct mptcp_pm_addr_entry *entry);
-void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
+ const struct mptcp_addr_info *addr);
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr, bool check_id);
-struct mptcp_pm_add_entry *
-mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr);
-int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
- unsigned int id,
- u8 *flags, int *ifindex);
-int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
- unsigned int id,
- u8 *flags, int *ifindex);
-int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
- struct mptcp_pm_addr_entry *loc,
- struct mptcp_pm_addr_entry *rem, u8 bkup);
+bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr);
+bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local,
+ struct genl_info *info);
+int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local,
+ struct genl_info *info);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
-void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
- struct list_head *rm_list);
-
-int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry);
-void mptcp_free_local_addr_list(struct mptcp_sock *msk);
-int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info);
-int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info);
-int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info);
-int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info);
+void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry);
+
+/* the default path manager, used in mptcp_pm_unregister */
+extern struct mptcp_pm_ops mptcp_pm_kernel;
+
+struct mptcp_pm_ops *mptcp_pm_find(const char *name);
+int mptcp_pm_register(struct mptcp_pm_ops *pm_ops);
+void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops);
+int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops);
+void mptcp_pm_get_available(char *buf, size_t maxlen);
+
+void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk);
void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
const struct sock *ssk, gfp_t gfp);
@@ -847,10 +1127,11 @@ void mptcp_event_pm_listener(const struct sock *ssk,
enum mptcp_event_type event);
bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);
-void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
- const struct mptcp_options_received *mp_opt);
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
struct request_sock *req);
+int mptcp_pm_genl_fill_addr(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct mptcp_pm_addr_entry *entry);
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
@@ -913,22 +1194,47 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
-int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *skc);
+int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *skc);
+bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc);
+bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb);
+int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
+ struct netlink_callback *cb);
+int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
+ struct genl_info *info);
+int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
+ struct genl_info *info);
+
+static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow)
+{
+ int local_id = READ_ONCE(subflow->local_id);
+ if (local_id < 0)
+ return 0;
+ return local_id;
+}
+
+void __init mptcp_pm_kernel_register(void);
+void __init mptcp_pm_userspace_register(void);
void __init mptcp_pm_nl_init(void);
-void mptcp_pm_nl_work(struct mptcp_sock *msk);
-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
- const struct mptcp_rm_list *rm_list);
-int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
-unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
+void mptcp_pm_worker(struct mptcp_sock *msk);
+void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
/* called under PM lock */
static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
- if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk))
+ if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk))
WRITE_ONCE(msk->pm.accept_subflow, true);
}
@@ -939,7 +1245,14 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
-void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
+static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.remote_deny_join_id0) &&
+ msk->pm.local_addr_used == 0 &&
+ mptcp_pm_get_limit_add_addr_accepted(msk) == 0 &&
+ msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk);
+}
+
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
@@ -962,23 +1275,26 @@ static inline bool mptcp_check_fallback(const struct sock *sk)
return __mptcp_check_fallback(msk);
}
-static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk)
{
- if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
- pr_debug("TCP fallback already done (msk=%p)", msk);
- return;
- }
- set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+ struct sock *ssk = READ_ONCE(msk->first);
+
+ return ssk && ((1 << inet_sk_state_load(ssk)) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+ TCPF_SYN_RECV | TCPF_LISTEN));
}
-static inline void mptcp_do_fallback(struct sock *ssk)
+bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib);
+
+static inline bool mptcp_try_fallback(struct sock *ssk, int fb_mib)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;
struct mptcp_sock *msk;
msk = mptcp_sk(sk);
- __mptcp_do_fallback(msk);
+ if (!__mptcp_try_fallback(msk, fb_mib))
+ return false;
if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
gfp_t saved_allocation = ssk->sk_allocation;
@@ -990,9 +1306,16 @@ static inline void mptcp_do_fallback(struct sock *ssk)
tcp_shutdown(ssk, SEND_SHUTDOWN);
ssk->sk_allocation = saved_allocation;
}
+ return true;
}
-#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
+static inline void mptcp_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ int fb_mib)
+{
+ subflow->request_mptcp = 0;
+ WARN_ON_ONCE(!__mptcp_try_fallback(msk, fb_mib));
+}
static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
{
@@ -1014,7 +1337,8 @@ static inline bool subflow_simultaneous_connect(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- return sk->sk_state == TCP_ESTABLISHED &&
+ return (1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) &&
is_active_ssk(subflow) &&
!subflow->conn_finished;
}
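
The notsent_lowat helpers added to this header back the new low-water-mark
behaviour: mptcp_stream_memory_free() compares the not-yet-sent bytes
(write_seq - snd_nxt), shifted left by `wake`, against mptcp_notsent_lowat(),
and both mptcp_write_space() and the poll path above pass wake == 1. A
stand-alone restatement of that arithmetic, using purely hypothetical numbers,
is sketched below:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* mirrors mptcp_stream_memory_free(): (notsent_bytes << wake) < lowat */
static bool stream_memory_free(uint64_t write_seq, uint64_t snd_nxt,
			       uint32_t notsent_lowat, int wake)
{
	uint64_t notsent_bytes = write_seq - snd_nxt;

	return (notsent_bytes << wake) < notsent_lowat;
}

int main(void)
{
	/* 10 KiB still queued against a 16 KiB low-water mark */
	printf("%d\n", stream_memory_free(26624, 16384, 16384, 0)); /* 1: writeable */
	printf("%d\n", stream_memory_free(26624, 16384, 16384, 1)); /* 0: 20 KiB >= 16 KiB */
	return 0;
}
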
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000000..1e59072d478c
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include "protocol.h"
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+static int mptcp_sched_default_get_send(struct mptcp_sock *msk)
+{
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_get_send(msk);
+ if (!ssk)
+ return -EINVAL;
+
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+ return 0;
+}
+
+static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk)
+{
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_get_retrans(msk);
+ if (!ssk)
+ return -EINVAL;
+
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+ return 0;
+}
+
+static struct mptcp_sched_ops mptcp_sched_default = {
+ .get_send = mptcp_sched_default_get_send,
+ .get_retrans = mptcp_sched_default_get_retrans,
+ .name = "default",
+ .owner = THIS_MODULE,
+};
+
+/* Must be called with rcu read lock held */
+struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+ struct mptcp_sched_ops *sched, *ret = NULL;
+
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ if (!strcmp(sched->name, name)) {
+ ret = sched;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Build string with list of available scheduler values.
+ * Similar to tcp_get_available_congestion_control()
+ */
+void mptcp_get_available_schedulers(char *buf, size_t maxlen)
+{
+ struct mptcp_sched_ops *sched;
+ size_t offs = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", sched->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
+ }
+ rcu_read_unlock();
+}
+
+int mptcp_validate_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (!sched->get_send) {
+ pr_err("%s does not implement required ops\n", sched->name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+ int ret;
+
+ ret = mptcp_validate_scheduler(sched);
+ if (ret)
+ return ret;
+
+ spin_lock(&mptcp_sched_list_lock);
+ if (mptcp_sched_find(sched->name)) {
+ spin_unlock(&mptcp_sched_list_lock);
+ return -EEXIST;
+ }
+ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+ spin_unlock(&mptcp_sched_list_lock);
+
+ pr_debug("%s registered\n", sched->name);
+ return 0;
+}
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (sched == &mptcp_sched_default)
+ return;
+
+ spin_lock(&mptcp_sched_list_lock);
+ list_del_rcu(&sched->list);
+ spin_unlock(&mptcp_sched_list_lock);
+}
+
+void mptcp_sched_init(void)
+{
+ mptcp_register_scheduler(&mptcp_sched_default);
+}
+
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched)
+{
+ if (!sched)
+ sched = &mptcp_sched_default;
+
+ if (!bpf_try_module_get(sched, sched->owner))
+ return -EBUSY;
+
+ msk->sched = sched;
+ if (msk->sched->init)
+ msk->sched->init(msk);
+
+ pr_debug("sched=%s\n", msk->sched->name);
+
+ return 0;
+}
+
+void mptcp_release_sched(struct mptcp_sock *msk)
+{
+ struct mptcp_sched_ops *sched = msk->sched;
+
+ if (!sched)
+ return;
+
+ msk->sched = NULL;
+ if (sched->release)
+ sched->release(msk);
+
+ bpf_module_put(sched, sched->owner);
+}
+
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled)
+{
+ WRITE_ONCE(subflow->scheduled, scheduled);
+}
+
+int mptcp_sched_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_send */
+ if (__mptcp_check_fallback(msk)) {
+ if (msk->first &&
+ __tcp_can_send(msk->first) &&
+ sk_stream_memory_free(msk->first)) {
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_send(msk);
+ return msk->sched->get_send(msk);
+}
+
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_retrans */
+ if (__mptcp_check_fallback(msk))
+ return -EINVAL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_retrans(msk);
+ if (msk->sched->get_retrans)
+ return msk->sched->get_retrans(msk);
+ return msk->sched->get_send(msk);
+}
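
The ops table and registration helpers above are also what an out-of-tree or
test scheduler would fill in. A hypothetical "first available subflow" policy,
built only on the symbols introduced here (mptcp_register_scheduler(),
mptcp_unregister_scheduler(), mptcp_subflow_set_scheduled()) and assuming those
symbols are reachable from the module, might look like this sketch:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical example scheduler, not part of this series */
#include <linux/kernel.h>
#include <linux/module.h>
#include "protocol.h"

static int mptcp_sched_first_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	/* pick the first subflow that can currently send */
	mptcp_for_each_subflow(msk, subflow) {
		if (!__mptcp_subflow_active(subflow))
			continue;

		mptcp_subflow_set_scheduled(subflow, true);
		return 0;
	}

	return -EINVAL;
}

static struct mptcp_sched_ops mptcp_sched_first = {
	.get_send	= mptcp_sched_first_get_send,
	.name		= "first",
	.owner		= THIS_MODULE,
};

static int __init mptcp_sched_first_init(void)
{
	return mptcp_register_scheduler(&mptcp_sched_first);
}

static void __exit mptcp_sched_first_exit(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_first);
}

module_init(mptcp_sched_first_init);
module_exit(mptcp_sched_first_exit);
MODULE_LICENSE("GPL");

Note that mptcp_validate_scheduler() only insists on .get_send:
mptcp_sched_get_retrans() falls back to .get_send when .get_retrans is absent,
so the sketch above omits it.
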
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index d4b1e6ec1b36..de90a2897d2d 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -14,11 +14,12 @@
#include <net/mptcp.h>
#include "protocol.h"
-#define MIN_INFO_OPTLEN_SIZE 16
+#define MIN_INFO_OPTLEN_SIZE 16
+#define MIN_FULL_INFO_OPTLEN_SIZE 40
static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
- sock_owned_by_me((const struct sock *)msk);
+ msk_owned_by_me(msk);
if (likely(!__mptcp_check_fallback(msk)))
return NULL;
@@ -88,12 +89,13 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
break;
case SO_PRIORITY:
- ssk->sk_priority = val;
+ WRITE_ONCE(ssk->sk_priority, val);
break;
case SO_SNDBUF:
case SO_SNDBUFFORCE:
ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+ mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
break;
case SO_RCVBUF:
case SO_RCVBUFFORCE:
@@ -102,7 +104,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
break;
case SO_MARK:
if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
- ssk->sk_mark = sk->sk_mark;
+ WRITE_ONCE(ssk->sk_mark, sk->sk_mark);
sk_dst_reset(ssk);
}
break;
@@ -179,8 +181,6 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
switch (optname) {
case SO_KEEPALIVE:
- mptcp_sol_socket_sync_intval(msk, optname, val);
- return 0;
case SO_DEBUG:
case SO_MARK:
case SO_PRIORITY:
@@ -291,7 +291,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = (struct sock *)msk;
- struct socket *ssock;
+ struct sock *ssk;
int ret;
switch (optname) {
@@ -300,22 +300,22 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_BINDTODEVICE:
case SO_BINDTOIFINDEX:
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
release_sock(sk);
- return -EINVAL;
+ return PTR_ERR(ssk);
}
- ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
+ ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen);
if (ret == 0) {
if (optname == SO_REUSEPORT)
- sk->sk_reuseport = ssock->sk->sk_reuseport;
+ sk->sk_reuseport = ssk->sk_reuseport;
else if (optname == SO_REUSEADDR)
- sk->sk_reuse = ssock->sk->sk_reuse;
+ sk->sk_reuse = ssk->sk_reuse;
else if (optname == SO_BINDTODEVICE)
- sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if;
+ sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
else if (optname == SO_BINDTOIFINDEX)
- sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if;
+ sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
}
release_sock(sk);
return ret;
@@ -355,6 +355,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_BROADCAST:
case SO_BSDCOMPAT:
case SO_PASSCRED:
+ case SO_PASSPIDFD:
case SO_PASSSEC:
case SO_RXQ_OVFL:
case SO_WIFI_STATUS:
@@ -388,20 +389,20 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
{
struct sock *sk = (struct sock *)msk;
int ret = -EOPNOTSUPP;
- struct socket *ssock;
+ struct sock *ssk;
switch (optname) {
case IPV6_V6ONLY:
case IPV6_TRANSPARENT:
case IPV6_FREEBIND:
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
release_sock(sk);
- return -EINVAL;
+ return PTR_ERR(ssk);
}
- ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
+ ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
if (ret != 0) {
release_sock(sk);
return ret;
@@ -411,13 +412,15 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
switch (optname) {
case IPV6_V6ONLY:
- sk->sk_ipv6only = ssock->sk->sk_ipv6only;
+ sk->sk_ipv6only = ssk->sk_ipv6only;
break;
case IPV6_TRANSPARENT:
- inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent;
+ inet_assign_bit(TRANSPARENT, sk,
+ inet_test_bit(TRANSPARENT, ssk));
break;
case IPV6_FREEBIND:
- inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind;
+ inet_assign_bit(FREEBIND, sk,
+ inet_test_bit(FREEBIND, ssk));
break;
}
@@ -435,6 +438,8 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* should work fine */
case IP_FREEBIND:
case IP_TRANSPARENT:
+ case IP_BIND_ADDRESS_NO_PORT:
+ case IP_LOCAL_PORT_RANGE:
/* the following are control cmsg related */
case IP_PKTINFO:
@@ -450,7 +455,6 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* common stuff that need some love */
case IP_TOS:
case IP_TTL:
- case IP_BIND_ADDRESS_NO_PORT:
case IP_MTU_DISCOVER:
case IP_RECVERR:
@@ -612,26 +616,42 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
}
if (ret == 0)
- strcpy(msk->ca_name, name);
+ strscpy(msk->ca_name, name, sizeof(msk->ca_name));
release_sock(sk);
return ret;
}
-static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
+ int (*set_val)(struct sock *, int),
+ int *msk_val, int val)
{
struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- int val;
+ int err = 0;
- if (optlen < sizeof(int))
- return -EINVAL;
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int ret;
- if (copy_from_sockptr(&val, optval, sizeof(val)))
- return -EFAULT;
+ lock_sock(ssk);
+ ret = set_val(ssk, val);
+ err = err ? : ret;
+ release_sock(ssk);
+ }
+
+ if (!err) {
+ *msk_val = val;
+ sockopt_seq_inc(msk);
+ }
+
+ return err;
+}
+
+static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
- lock_sock(sk);
sockopt_seq_inc(msk);
msk->cork = !!val;
mptcp_for_each_subflow(msk, subflow) {
@@ -643,25 +663,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optva
}
if (!val)
mptcp_check_and_set_pending(sk);
- release_sock(sk);
return 0;
}
-static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- int val;
- if (optlen < sizeof(int))
- return -EINVAL;
-
- if (copy_from_sockptr(&val, optval, sizeof(val)))
- return -EFAULT;
-
- lock_sock(sk);
sockopt_seq_inc(msk);
msk->nodelay = !!val;
mptcp_for_each_subflow(msk, subflow) {
@@ -673,17 +683,14 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op
}
if (val)
mptcp_check_and_set_pending(sk);
- release_sock(sk);
-
return 0;
}
-static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname,
- sockptr_t optval, unsigned int optlen)
+static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = (struct sock *)msk;
- struct inet_sock *issk;
- struct socket *ssock;
+ struct sock *ssk;
int err;
err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
@@ -692,20 +699,27 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
release_sock(sk);
- return -EINVAL;
+ return PTR_ERR(ssk);
}
- issk = inet_sk(ssock->sk);
-
switch (optname) {
case IP_FREEBIND:
- issk->freebind = inet_sk(sk)->freebind;
+ inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
break;
case IP_TRANSPARENT:
- issk->transparent = inet_sk(sk)->transparent;
+ inet_assign_bit(TRANSPARENT, ssk,
+ inet_test_bit(TRANSPARENT, sk));
+ break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk,
+ inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
+ break;
+ case IP_LOCAL_PORT_RANGE:
+ WRITE_ONCE(inet_sk(ssk)->local_port_range,
+ READ_ONCE(inet_sk(sk)->local_port_range));
break;
default:
release_sock(sk);
@@ -732,11 +746,14 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
lock_sock(sk);
sockopt_seq_inc(msk);
- val = inet_sk(sk)->tos;
+ val = READ_ONCE(inet_sk(sk)->tos);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+ slow = lock_sock_fast(ssk);
__ip_sock_set_tos(ssk, val);
+ unlock_sock_fast(ssk, slow);
}
release_sock(sk);
@@ -749,7 +766,9 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
switch (optname) {
case IP_FREEBIND:
case IP_TRANSPARENT:
- return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen);
+ case IP_BIND_ADDRESS_NO_PORT:
+ case IP_LOCAL_PORT_RANGE:
+ return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen);
case IP_TOS:
return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen);
}
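
For context, the new IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE cases set the value on the MPTCP socket and mirror it to the initial subflow, so applications keep using the familiar TCP-style call. A hedged userspace sketch, not part of the patch (the fallback define is only for older uapi headers; the value packs the bounds as lo | hi << 16, as for plain TCP sockets):

#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_LOCAL_PORT_RANGE
#define IP_LOCAL_PORT_RANGE 51	/* assumption: value from recent <linux/in.h> */
#endif

static int set_local_port_range(int fd, unsigned short lo, unsigned short hi)
{
	/* lower 16 bits: lower bound, upper 16 bits: upper bound */
	unsigned int range = lo | ((unsigned int)hi << 16);

	return setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE,
			  &range, sizeof(range));
}
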
@@ -760,14 +779,40 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
- struct socket *sock;
+ struct sock *sk = (struct sock *)msk;
+ struct sock *ssk;
+ int ret;
/* Limit to first subflow, before the connection establishment */
- sock = __mptcp_nmpc_socket(msk);
- if (!sock)
- return -EINVAL;
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ ret = PTR_ERR(ssk);
+ goto unlock;
+ }
+
+ ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
- return tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+unlock:
+ release_sock(sk);
+ return ret;
+}
+
+static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
+ int optname, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
+ if (ret)
+ break;
+ }
+ return ret;
}
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
@@ -777,25 +822,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
int ret, val;
switch (optname) {
- case TCP_INQ:
- ret = mptcp_get_int_option(msk, optval, optlen, &val);
- if (ret)
- return ret;
- if (val < 0 || val > 1)
- return -EINVAL;
-
- lock_sock(sk);
- msk->recvmsg_inq = !!val;
- release_sock(sk);
- return 0;
case TCP_ULP:
return -EOPNOTSUPP;
case TCP_CONGESTION:
return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
- case TCP_CORK:
- return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
- case TCP_NODELAY:
- return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
case TCP_DEFER_ACCEPT:
/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
@@ -808,7 +838,55 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
optval, optlen);
}
- return -EOPNOTSUPP;
+ ret = mptcp_get_int_option(msk, optval, optlen, &val);
+ if (ret)
+ return ret;
+
+ lock_sock(sk);
+ switch (optname) {
+ case TCP_INQ:
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ msk->recvmsg_inq = !!val;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ WRITE_ONCE(msk->notsent_lowat, val);
+ mptcp_write_space(sk);
+ break;
+ case TCP_CORK:
+ ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
+ break;
+ case TCP_NODELAY:
+ ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
+ break;
+ case TCP_KEEPIDLE:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
+ &tcp_sock_set_keepidle_locked,
+ &msk->keepalive_idle, val);
+ break;
+ case TCP_KEEPINTVL:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
+ &tcp_sock_set_keepintvl,
+ &msk->keepalive_intvl, val);
+ break;
+ case TCP_KEEPCNT:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
+ &tcp_sock_set_keepcnt,
+ &msk->keepalive_cnt,
+ val);
+ break;
+ case TCP_MAXSEG:
+ msk->maxseg = val;
+ ret = mptcp_setsockopt_all_sf(msk, SOL_TCP, optname, optval,
+ optlen);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ }
+
+ release_sock(sk);
+ return ret;
}
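
With this rework, the newly handled SOL_TCP values (the keepalive timers, TCP_NOTSENT_LOWAT, TCP_MAXSEG) are parsed once, stored in the msk and then replayed on every subflow, so a single call on the MPTCP socket is enough. A minimal userspace sketch, assuming only standard headers (editorial example, not part of the patch):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* assumption: value from <linux/in.h> */
#endif

static int open_mptcp_with_keepalive(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	if (fd < 0)
		return -1;

	/* each value is cached in the msk and propagated to the subflows */
	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) {
		perror("setsockopt");
		close(fd);
		return -1;
	}
	return fd;
}
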
int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -817,7 +895,7 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname,
struct mptcp_sock *msk = mptcp_sk(sk);
struct sock *ssk;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
if (level == SOL_SOCKET)
return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
@@ -853,22 +931,22 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
char __user *optval, int __user *optlen)
{
struct sock *sk = (struct sock *)msk;
- struct socket *ssock;
- int ret = -EINVAL;
struct sock *ssk;
+ int ret;
lock_sock(sk);
ssk = msk->first;
- if (ssk) {
- ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
- goto out;
- }
+ if (ssk)
+ goto get;
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ ret = PTR_ERR(ssk);
goto out;
+ }
- ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen);
+get:
+ ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
out:
release_sock(sk);
@@ -877,31 +955,64 @@ out:
void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
{
+ struct sock *sk = (struct sock *)msk;
u32 flags = 0;
- u8 val;
+ bool slow;
+ u32 now;
memset(info, 0, sizeof(*info));
- info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
+ info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows);
info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
- info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk);
- val = mptcp_pm_get_add_addr_signal_max(msk);
- info->mptcpi_add_addr_signal_max = val;
- val = mptcp_pm_get_add_addr_accept_max(msk);
- info->mptcpi_add_addr_accepted_max = val;
- info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk);
- if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
+
+ if (inet_sk_state_load(sk) == TCP_LISTEN)
+ return;
+
+ /* The following limits only make sense for the in-kernel PM */
+ if (mptcp_pm_is_kernel(msk)) {
+ info->mptcpi_limit_extra_subflows =
+ mptcp_pm_get_limit_extra_subflows(msk);
+ info->mptcpi_endp_signal_max =
+ mptcp_pm_get_endp_signal_max(msk);
+ info->mptcpi_limit_add_addr_accepted =
+ mptcp_pm_get_limit_add_addr_accepted(msk);
+ info->mptcpi_endp_subflow_max =
+ mptcp_pm_get_endp_subflow_max(msk);
+ info->mptcpi_endp_laminar_max =
+ mptcp_pm_get_endp_laminar_max(msk);
+ info->mptcpi_endp_fullmesh_max =
+ mptcp_pm_get_endp_fullmesh_max(msk);
+ }
+
+ if (__mptcp_check_fallback(msk))
flags |= MPTCP_INFO_FLAG_FALLBACK;
if (READ_ONCE(msk->can_ack))
flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
info->mptcpi_flags = flags;
- info->mptcpi_token = READ_ONCE(msk->token);
- info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
- info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
- info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
+
+ slow = lock_sock_fast(sk);
info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
+ info->mptcpi_token = msk->token;
+ info->mptcpi_write_seq = msk->write_seq;
+ info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
+ info->mptcpi_bytes_sent = msk->bytes_sent;
+ info->mptcpi_bytes_received = msk->bytes_received;
+ info->mptcpi_bytes_retrans = msk->bytes_retrans;
+ info->mptcpi_subflows_total = info->mptcpi_extra_subflows +
+ __mptcp_has_initial_subflow(msk);
+ now = tcp_jiffies32;
+ info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
+ info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv);
+ unlock_sock_fast(sk, slow);
+
+ mptcp_data_lock(sk);
+ info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv);
+ info->mptcpi_snd_una = msk->snd_una;
+ info->mptcpi_rcv_nxt = msk->ack_seq;
+ info->mptcpi_bytes_acked = msk->bytes_acked;
+ mptcp_data_unlock(sk);
}
EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
@@ -913,6 +1024,10 @@ static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, in
if (get_user(len, optlen))
return -EFAULT;
+ /* When used only to check if a fallback to TCP happened. */
+ if (len == 0)
+ return 0;
+
len = min_t(unsigned int, len, sizeof(struct mptcp_info));
mptcp_diag_fill_info(msk, &m_info);
@@ -948,7 +1063,8 @@ static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
}
static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
- char __user *optval, int __user *optlen)
+ char __user *optval,
+ int __user *optlen)
{
int len, copylen;
@@ -1039,7 +1155,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
memset(a, 0, sizeof(*a));
@@ -1129,6 +1245,125 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o
return 0;
}
+static int mptcp_get_full_info(struct mptcp_full_info *mfi,
+ char __user *optval,
+ int __user *optlen)
+{
+ int len;
+
+ BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) !=
+ MIN_FULL_INFO_OPTLEN_SIZE);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < MIN_FULL_INFO_OPTLEN_SIZE)
+ return -EINVAL;
+
+ memset(mfi, 0, sizeof(*mfi));
+ if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE))
+ return -EFAULT;
+
+ if (mfi->size_tcpinfo_kernel ||
+ mfi->size_sfinfo_kernel ||
+ mfi->num_subflows)
+ return -EINVAL;
+
+ if (mfi->size_sfinfo_user > INT_MAX ||
+ mfi->size_tcpinfo_user > INT_MAX)
+ return -EINVAL;
+
+ return len - MIN_FULL_INFO_OPTLEN_SIZE;
+}
+
+static int mptcp_put_full_info(struct mptcp_full_info *mfi,
+ char __user *optval,
+ u32 copylen,
+ int __user *optlen)
+{
+ copylen += MIN_FULL_INFO_OPTLEN_SIZE;
+ if (put_user(copylen, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, mfi, copylen))
+ return -EFAULT;
+ return 0;
+}
+
+static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen)
+{
+ unsigned int sfcount = 0, copylen = 0;
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ void __user *tcpinfoptr, *sfinfoptr;
+ struct mptcp_full_info mfi;
+ int len;
+
+ len = mptcp_get_full_info(&mfi, optval, optlen);
+ if (len < 0)
+ return len;
+
+ /* don't bother filling the mptcp info if there is not enough
+ * user-space-provided storage
+ */
+ if (len > 0) {
+ mptcp_diag_fill_info(msk, &mfi.mptcp_info);
+ copylen += min_t(unsigned int, len, sizeof(struct mptcp_info));
+ }
+
+ mfi.size_tcpinfo_kernel = sizeof(struct tcp_info);
+ mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user,
+ sizeof(struct tcp_info));
+ sfinfoptr = u64_to_user_ptr(mfi.subflow_info);
+ mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info);
+ mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user,
+ sizeof(struct mptcp_subflow_info));
+ tcpinfoptr = u64_to_user_ptr(mfi.tcp_info);
+
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct mptcp_subflow_info sfinfo;
+ struct tcp_info tcp_info;
+
+ if (sfcount++ >= mfi.size_arrays_user)
+ continue;
+
+ /* fetch addr/tcp_info only if the user space buffers
+ * are wide enough
+ */
+ memset(&sfinfo, 0, sizeof(sfinfo));
+ sfinfo.id = subflow->subflow_id;
+ if (mfi.size_sfinfo_user >
+ offsetof(struct mptcp_subflow_info, addrs))
+ mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
+ if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user))
+ goto fail_release;
+
+ if (mfi.size_tcpinfo_user) {
+ tcp_get_info(ssk, &tcp_info);
+ if (copy_to_user(tcpinfoptr, &tcp_info,
+ mfi.size_tcpinfo_user))
+ goto fail_release;
+ }
+
+ tcpinfoptr += mfi.size_tcpinfo_user;
+ sfinfoptr += mfi.size_sfinfo_user;
+ }
+ release_sock(sk);
+
+ mfi.num_subflows = sfcount;
+ if (mptcp_put_full_info(&mfi, optval, copylen, optlen))
+ return -EFAULT;
+
+ return 0;
+
+fail_release:
+ release_sock(sk);
+ return -EFAULT;
+}
+
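
MPTCP_FULL_INFO returns the msk-level mptcp_info plus, per subflow, one mptcp_subflow_info and one tcp_info written into user-supplied arrays; num_subflows reports how many subflows exist so the caller can detect a too-small size_arrays_user. An illustrative consumer, assuming the uapi definitions from a recent <linux/mptcp.h> and SOL_MPTCP from the standard socket headers (the fallback define is only for older toolchains; this is a sketch, not part of the patch):

#include <sys/socket.h>
#include <linux/mptcp.h>
#include <linux/tcp.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#ifndef SOL_MPTCP
#define SOL_MPTCP 284
#endif

#define MAX_SF 8

static int dump_full_info(int fd)
{
	struct mptcp_subflow_info sfinfo[MAX_SF];
	struct tcp_info tcpi[MAX_SF];
	struct mptcp_full_info mfi;
	socklen_t len = sizeof(mfi);
	unsigned int i;

	memset(&mfi, 0, sizeof(mfi));		/* kernel-filled fields must be 0 */
	mfi.size_arrays_user = MAX_SF;
	mfi.size_sfinfo_user = sizeof(sfinfo[0]);
	mfi.size_tcpinfo_user = sizeof(tcpi[0]);
	mfi.subflow_info = (uintptr_t)sfinfo;
	mfi.tcp_info = (uintptr_t)tcpi;

	if (getsockopt(fd, SOL_MPTCP, MPTCP_FULL_INFO, &mfi, &len) < 0) {
		perror("MPTCP_FULL_INFO");
		return -1;
	}

	for (i = 0; i < mfi.num_subflows && i < MAX_SF; i++)
		printf("subflow id %u: rtt %u us\n",
		       sfinfo[i].id, tcpi[i].tcpi_rtt);
	return 0;
}
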
static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
int __user *optlen, int val)
{
@@ -1161,6 +1396,8 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
char __user *optval, int __user *optlen)
{
+ struct sock *sk = (void *)msk;
+
switch (optname) {
case TCP_ULP:
case TCP_CONGESTION:
@@ -1179,6 +1416,25 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
return mptcp_put_int_option(msk, optval, optlen, msk->cork);
case TCP_NODELAY:
return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+ case TCP_KEEPIDLE:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_idle ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ);
+ case TCP_KEEPINTVL:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_intvl ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ);
+ case TCP_KEEPCNT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_cnt ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes));
+ case TCP_NOTSENT_LOWAT:
+ return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
+ case TCP_IS_MPTCP:
+ return mptcp_put_int_option(msk, optval, optlen, 1);
+ case TCP_MAXSEG:
+ return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
+ optval, optlen);
}
return -EOPNOTSUPP;
}
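
TCP_IS_MPTCP gives applications a cheap probe: the msk-level handler above always reports 1, even after a fallback to TCP. A hedged sketch, assuming TCP_IS_MPTCP is exposed by a recent <linux/tcp.h> (plain TCP sockets on recent kernels answer 0 through the same option):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_IS_MPTCP
#define TCP_IS_MPTCP 43		/* assumption: value from recent <linux/tcp.h> */
#endif

/* 1: MPTCP socket (always, even after fallback), 0: not MPTCP, -1: error */
static int sock_is_mptcp(int fd)
{
	socklen_t len = sizeof(int);
	int val = 0;

	if (getsockopt(fd, IPPROTO_TCP, TCP_IS_MPTCP, &val, &len) < 0)
		return -1;
	return val;
}
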
@@ -1190,7 +1446,39 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
switch (optname) {
case IP_TOS:
- return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
+ return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
+ case IP_FREEBIND:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(FREEBIND, sk));
+ case IP_TRANSPARENT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(TRANSPARENT, sk));
+ case IP_BIND_ADDRESS_NO_PORT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
+ case IP_LOCAL_PORT_RANGE:
+ return mptcp_put_int_option(msk, optval, optlen,
+ READ_ONCE(inet_sk(sk)->local_port_range));
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = (void *)msk;
+
+ switch (optname) {
+ case IPV6_V6ONLY:
+ return mptcp_put_int_option(msk, optval, optlen,
+ sk->sk_ipv6only);
+ case IPV6_TRANSPARENT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(TRANSPARENT, sk));
+ case IPV6_FREEBIND:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(FREEBIND, sk));
}
return -EOPNOTSUPP;
@@ -1202,6 +1490,8 @@ static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
switch (optname) {
case MPTCP_INFO:
return mptcp_getsockopt_info(msk, optval, optlen);
+ case MPTCP_FULL_INFO:
+ return mptcp_getsockopt_full_info(msk, optval, optlen);
case MPTCP_TCPINFO:
return mptcp_getsockopt_tcpinfo(msk, optval, optlen);
case MPTCP_SUBFLOW_ADDRS:
@@ -1217,7 +1507,7 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname,
struct mptcp_sock *msk = mptcp_sk(sk);
struct sock *ssk;
- pr_debug("msk=%p", msk);
+ pr_debug("msk=%p\n", msk);
/* @@ the meaning of setsockopt() when the socket is connected and
* there are multiple subflows is not yet defined. It is up to the
@@ -1233,6 +1523,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname,
if (level == SOL_IP)
return mptcp_getsockopt_v4(msk, optname, optval, option);
+ if (level == SOL_IPV6)
+ return mptcp_getsockopt_v6(msk, optname, optval, option);
if (level == SOL_TCP)
return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
if (level == SOL_MPTCP)
@@ -1244,23 +1536,25 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
{
static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
struct sock *sk = (struct sock *)msk;
+ bool keep_open;
- if (ssk->sk_prot->keepalive) {
- if (sock_flag(sk, SOCK_KEEPOPEN))
- ssk->sk_prot->keepalive(ssk, 1);
- else
- ssk->sk_prot->keepalive(ssk, 0);
- }
+ keep_open = sock_flag(sk, SOCK_KEEPOPEN);
+ if (ssk->sk_prot->keepalive)
+ ssk->sk_prot->keepalive(ssk, keep_open);
+ sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open);
ssk->sk_priority = sk->sk_priority;
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
+ ssk->sk_ipv6only = sk->sk_ipv6only;
__ip_sock_set_tos(ssk, inet_sk(sk)->tos);
if (sk->sk_userlocks & tx_rx_locks) {
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
- if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+ mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
+ }
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
}
@@ -1283,42 +1577,78 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
tcp_set_congestion_control(ssk, msk->ca_name, false, true);
__tcp_sock_set_cork(ssk, !!msk->cork);
__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
-
- inet_sk(ssk)->transparent = inet_sk(sk)->transparent;
- inet_sk(ssk)->freebind = inet_sk(sk)->freebind;
-}
-
-static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
-{
- bool slow = lock_sock_fast(ssk);
-
- sync_socket_options(msk, ssk);
-
- unlock_sock_fast(ssk, slow);
+ tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
+ tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
+ tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
+ tcp_sock_set_maxseg(ssk, msk->maxseg);
+
+ inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
+ inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
+ inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
+ WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range));
}
-void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
+void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
msk_owned_by_me(msk);
+ ssk->sk_rcvlowat = 0;
+
+ /* subflows must ignore any latency-related settings: will not affect
+ * the user-space - only the msk is relevant - but will foul the
+ * mptcp scheduler
+ */
+ tcp_sk(ssk)->notsent_lowat = UINT_MAX;
+
if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
- __mptcp_sockopt_sync(msk, ssk);
+ sync_socket_options(msk, ssk);
subflow->setsockopt_seq = msk->setsockopt_seq;
}
}
-void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
+/* unfortunately this is different enough from the tcp version so
+ * that we can't factor it out
+ */
+int mptcp_set_rcvlowat(struct sock *sk, int val)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_subflow_context *subflow;
+ int space, cap;
- msk_owned_by_me(msk);
+ /* bpf can land here with a wrong sk type */
+ if (sk->sk_protocol == IPPROTO_TCP)
+ return -EINVAL;
- if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
- sync_socket_options(msk, ssk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+ cap = sk->sk_rcvbuf >> 1;
+ else
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
+ val = min(val, cap);
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
- subflow->setsockopt_seq = msk->setsockopt_seq;
+ /* Check if we need to signal EPOLLIN right now */
+ if (mptcp_epollin_ready(sk))
+ sk->sk_data_ready(sk);
+
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+ return 0;
+
+ space = mptcp_space_from_win(sk, val);
+ if (space <= sk->sk_rcvbuf)
+ return 0;
+
+ /* propagate the rcvbuf changes to all the subflows */
+ WRITE_ONCE(sk->sk_rcvbuf, space);
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ WRITE_ONCE(ssk->sk_rcvbuf, space);
+ WRITE_ONCE(tcp_sk(ssk)->window_clamp, val);
+ unlock_sock_fast(ssk, slow);
}
+ return 0;
}
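
mptcp_set_rcvlowat() mirrors tcp_set_rcvlowat() but adds a propagation step: when the requested low-water mark needs more receive buffer, the msk grows its own sk_rcvbuf and pushes the new size plus a matching window clamp to every subflow. From userspace it stays the usual socket-level knob; a trivial, hedged sketch (not part of the patch):

#include <sys/socket.h>
#include <stdio.h>

static int set_mptcp_rcvlowat(int fd, int bytes)
{
	/* handled by mptcp_set_rcvlowat() above; may enlarge sk_rcvbuf and
	 * sync the subflows' rcvbuf/window clamp
	 */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &bytes, sizeof(bytes)) < 0) {
		perror("SO_RCVLOWAT");
		return -1;
	}
	return 0;
}
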
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ec54413fb31f..86ce58ae533d 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -9,23 +9,23 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
-#include <crypto/algapi.h>
#include <crypto/sha2.h>
+#include <crypto/utils.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
-#include <uapi/linux/mptcp.h>
+
#include "protocol.h"
#include "mib.h"
#include <trace/events/mptcp.h>
+#include <trace/events/sock.h>
static void mptcp_subflow_ops_undo_override(struct sock *ssk);
@@ -39,7 +39,7 @@ static void subflow_req_destructor(struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- pr_debug("subflow_req=%p", subflow_req);
+ pr_debug("subflow_req=%p\n", subflow_req);
if (subflow_req->msk)
sock_put((struct sock *)subflow_req->msk);
@@ -74,7 +74,8 @@ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_
get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
- subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_generate_hmac(READ_ONCE(msk->local_key),
+ READ_ONCE(msk->remote_key),
subflow_req->local_nonce,
subflow_req->remote_nonce, hmac);
@@ -99,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
return NULL;
}
subflow_req->local_id = local_id;
+ subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req);
return msk;
}
@@ -130,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
}
}
+static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
+{
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ return -EPERM;
+}
+
/* Init mptcp request socket.
*
* Returns an error code if a JOIN has failed and a TCP reset
@@ -144,27 +153,36 @@ static int subflow_check_req(struct request_sock *req,
struct mptcp_options_received mp_opt;
bool opt_mp_capable, opt_mp_join;
- pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+ pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
- if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
return -EINVAL;
+ }
#endif
mptcp_get_options(skb, &mp_opt);
- opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
- opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
+ opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN);
+ opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN);
if (opt_mp_capable) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
+ if (unlikely(listener->pm_listener))
+ return subflow_reset_req_endp(req, skb);
if (opt_mp_join)
return 0;
} else if (opt_mp_join) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+
+ if (mp_opt.backup)
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
+ } else if (unlikely(listener->pm_listener)) {
+ return subflow_reset_req_endp(req, skb);
}
if (opt_mp_capable && listener->request_mptcp) {
@@ -214,11 +232,12 @@ again:
}
if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
- pr_debug("syn inet_sport=%d %d",
+ pr_debug("syn inet_sport=%d %d\n",
ntohs(inet_sk(sk_listener)->inet_sport),
ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
return -EPERM;
}
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
@@ -227,13 +246,16 @@ again:
subflow_req_create_thmac(subflow_req);
if (unlikely(req->syncookie)) {
- if (mptcp_can_accept_new_subflow(subflow_req->msk))
- subflow_init_req_cookie_join_save(subflow_req, skb);
- else
+ if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
return -EPERM;
+ }
+
+ subflow_init_req_cookie_join_save(subflow_req, skb);
}
- pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
+ pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token,
subflow_req->remote_nonce, subflow_req->msk);
}
@@ -253,8 +275,8 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
subflow_init_req(req, sk_listener);
mptcp_get_options(skb, &mp_opt);
- opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
- opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
+ opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK);
+ opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK);
if (opt_mp_capable && opt_mp_join)
return -EINVAL;
@@ -281,10 +303,21 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
+static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb)
+{
+ const struct mptcp_ext *mpext = mptcp_get_ext(skb);
+
+ if (!mpext)
+ return SK_RST_REASON_NOT_SPECIFIED;
+
+ return sk_rst_convert_mptcp_reason(mpext->reset_reason);
+}
+
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
- struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
struct dst_entry *dst;
int err;
@@ -292,7 +325,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
tcp_rsk(req)->is_mptcp = 1;
subflow_init_req(req, sk);
- dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+ dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
if (!dst)
return NULL;
@@ -302,7 +335,8 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
dst_release(dst);
if (!req->syncookie)
- tcp_request_sock_ops.send_reset(sk, skb);
+ tcp_request_sock_ops.send_reset(sk, skb,
+ mptcp_get_rst_reason(skb));
return NULL;
}
@@ -351,7 +385,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
- struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
struct dst_entry *dst;
int err;
@@ -359,7 +394,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
tcp_rsk(req)->is_mptcp = 1;
subflow_init_req(req, sk);
- dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+ dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
if (!dst)
return NULL;
@@ -369,7 +404,8 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
dst_release(dst);
if (!req->syncookie)
- tcp6_request_sock_ops.send_reset(sk, skb);
+ tcp6_request_sock_ops.send_reset(sk, skb,
+ mptcp_get_rst_reason(skb));
return NULL;
}
#endif
@@ -396,15 +432,19 @@ void mptcp_subflow_reset(struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;
+ /* mptcp_mp_fail_no_response() can reach here on an already closed
+ * socket
+ */
+ if (ssk->sk_state == TCP_CLOSE)
+ return;
+
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
- tcp_set_state(ssk, TCP_CLOSE);
- tcp_send_active_reset(ssk, GFP_ATOMIC);
+ mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
- if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
- schedule_work(&mptcp_sk(sk)->work))
- return; /* worker will put sk for us */
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
sock_put(sk);
}
@@ -414,24 +454,28 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc
return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
}
-void __mptcp_set_connected(struct sock *sk)
+void __mptcp_sync_state(struct sock *sk, int state)
{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk = msk->first;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ __mptcp_propagate_sndbuf(sk, ssk);
+ if (!msk->rcvspace_init)
+ mptcp_rcv_space_init(msk, ssk);
+
if (sk->sk_state == TCP_SYN_SENT) {
- inet_sk_state_store(sk, TCP_ESTABLISHED);
+ /* subflow->idsn is always available in TCP_SYN_SENT state,
+ * even for the FASTOPEN scenarios
+ */
+ WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+ mptcp_set_state(sk, state);
sk->sk_state_change(sk);
}
}
-static void mptcp_set_connected(struct sock *sk)
-{
- mptcp_data_lock(sk);
- if (!sock_owned_by_user(sk))
- __mptcp_set_connected(sk);
- else
- __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags);
- mptcp_data_unlock(sk);
-}
-
static void subflow_set_remote_key(struct mptcp_sock *msk,
struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt)
@@ -447,12 +491,40 @@ static void subflow_set_remote_key(struct mptcp_sock *msk,
mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
subflow->iasn++;
+ /* for fallback's sake */
+ subflow->map_seq = subflow->iasn;
+
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->ack_seq, subflow->iasn);
WRITE_ONCE(msk->can_ack, true);
atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
}
+static void mptcp_propagate_state(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_data_lock(sk);
+ if (mp_opt) {
+ /* Options are available only in the non-fallback cases;
+ * avoid updating rx path fields otherwise
+ */
+ WRITE_ONCE(msk->snd_una, subflow->idsn + 1);
+ WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd);
+ subflow_set_remote_key(msk, subflow, mp_opt);
+ }
+
+ if (!sock_owned_by_user(sk)) {
+ __mptcp_sync_state(sk, ssk->sk_state);
+ } else {
+ msk->pending_state = ssk->sk_state;
+ __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
+ }
+ mptcp_data_unlock(sk);
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -467,19 +539,21 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
return;
msk = mptcp_sk(parent);
- mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
- pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
+ pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp) {
- if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
- MPTCP_INC_STATS(sock_net(sk),
- MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
- mptcp_do_fallback(sk);
- pr_fallback(msk);
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
+ if (!mptcp_try_fallback(sk,
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK)) {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_FALLBACKFAILED);
+ goto do_reset;
+ }
+
goto fallback;
}
@@ -488,14 +562,14 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (mp_opt.deny_join_id0)
WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
subflow->mp_capable = 1;
- subflow_set_remote_key(msk, subflow, &mp_opt);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
mptcp_finish_connect(sk);
- mptcp_set_connected(parent);
+ mptcp_active_enable(parent);
+ mptcp_propagate_state(parent, sk, subflow, &mp_opt);
} else if (subflow->request_join) {
u8 hmac[SHA256_DIGEST_SIZE];
- if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) {
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) {
subflow->reset_reason = MPTCP_RST_EMPTCP;
goto do_reset;
}
@@ -503,8 +577,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->backup = mp_opt.backup;
subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce;
- subflow->remote_id = mp_opt.join_id;
- pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
+ WRITE_ONCE(subflow->remote_id, mp_opt.join_id);
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n",
subflow, subflow->thmac, subflow->remote_nonce,
subflow->backup);
@@ -526,16 +600,21 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+ if (subflow->backup)
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX);
+
if (subflow_use_different_dport(msk, sk)) {
- pr_debug("synack inet_dport=%d %d",
+ pr_debug("synack inet_dport=%d %d\n",
ntohs(inet_sk(sk)->inet_dport),
ntohs(inet_sk(parent)->inet_dport));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
}
} else if (mptcp_check_fallback(sk)) {
+ /* It looks like MPTCP is blocked, while TCP is not */
+ if (subflow->mpc_drop)
+ mptcp_active_disable(parent);
fallback:
- mptcp_rcv_space_init(msk, sk);
- mptcp_set_connected(parent);
+ mptcp_propagate_state(parent, sk, subflow, NULL);
}
return;
@@ -546,8 +625,8 @@ do_reset:
static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
{
- subflow->local_id = local_id;
- subflow->local_id_valid = 1;
+ WARN_ON_ONCE(local_id < 0 || local_id > 255);
+ WRITE_ONCE(subflow->local_id, local_id);
}
static int subflow_chk_local_id(struct sock *sk)
@@ -556,7 +635,7 @@ static int subflow_chk_local_id(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
int err;
- if (likely(subflow->local_id_valid))
+ if (likely(subflow->local_id >= 0))
return 0;
err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
@@ -564,6 +643,8 @@ static int subflow_chk_local_id(struct sock *sk)
return err;
subflow_set_local_id(subflow, err);
+ subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk);
+
return 0;
}
@@ -596,7 +677,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
/* Never answer to SYNs sent to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -621,13 +702,13 @@ static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
-static struct proto tcpv6_prot_override;
+static struct proto tcpv6_prot_override __ro_after_init;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
if (skb->protocol == htons(ETH_P_IP))
return subflow_v4_conn_request(sk, skb);
@@ -671,32 +752,20 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op
EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);
/* validate hmac received in third ACK */
-static bool subflow_hmac_valid(const struct request_sock *req,
+static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req,
const struct mptcp_options_received *mp_opt)
{
- const struct mptcp_subflow_request_sock *subflow_req;
+ struct mptcp_sock *msk = subflow_req->msk;
u8 hmac[SHA256_DIGEST_SIZE];
- struct mptcp_sock *msk;
-
- subflow_req = mptcp_subflow_rsk(req);
- msk = subflow_req->msk;
- if (!msk)
- return false;
- subflow_generate_hmac(msk->remote_key, msk->local_key,
+ subflow_generate_hmac(READ_ONCE(msk->remote_key),
+ READ_ONCE(msk->local_key),
subflow_req->remote_nonce,
subflow_req->local_nonce, hmac);
return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}
-static void mptcp_force_close(struct sock *sk)
-{
- /* the msk is not yet exposed to user-space */
- inet_sk_state_store(sk, TCP_CLOSE);
- sk_common_release(sk);
-}
-
static void subflow_ulp_fallback(struct sock *sk,
struct mptcp_subflow_context *old_ctx)
{
@@ -710,31 +779,30 @@ static void subflow_ulp_fallback(struct sock *sk,
mptcp_subflow_ops_undo_override(sk);
}
-static void subflow_drop_ctx(struct sock *ssk)
+void mptcp_subflow_drop_ctx(struct sock *ssk)
{
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
if (!ctx)
return;
- subflow_ulp_fallback(ssk, ctx);
- if (ctx->conn)
- sock_put(ctx->conn);
+ list_del(&mptcp_subflow_ctx(ssk)->node);
+ if (inet_csk(ssk)->icsk_ulp_ops) {
+ subflow_ulp_fallback(ssk, ctx);
+ if (ctx->conn)
+ sock_put(ctx->conn);
+ }
kfree_rcu(ctx, rcu);
}
-void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
- const struct mptcp_options_received *mp_opt)
+void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
{
- struct mptcp_sock *msk = mptcp_sk(subflow->conn);
-
subflow_set_remote_key(msk, subflow, mp_opt);
- subflow->fully_established = 1;
+ WRITE_ONCE(subflow->fully_established, true);
WRITE_ONCE(msk->fully_established, true);
-
- if (subflow->is_mptfo)
- mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
}
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
@@ -748,10 +816,11 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
- struct sock *new_msk = NULL;
+ enum sk_rst_reason reason;
+ struct mptcp_sock *owner;
struct sock *child;
- pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+ pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn);
/* After child creation we must look for MPC even when options
* are not parsed
@@ -776,22 +845,14 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
* options.
*/
mptcp_get_options(skb, &mp_opt);
- if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
+ if (!(mp_opt.suboptions &
+ (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK)))
fallback = true;
- goto create_child;
- }
- new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
- if (!new_msk)
- fallback = true;
} else if (subflow_req->mp_join) {
mptcp_get_options(skb, &mp_opt);
- if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) ||
- !subflow_hmac_valid(req, &mp_opt) ||
- !mptcp_can_accept_new_subflow(subflow_req->msk)) {
- SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK))
fallback = true;
- }
}
create_child:
@@ -812,85 +873,81 @@ create_child:
subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
goto dispose_child;
}
-
- if (new_msk)
- mptcp_copy_inaddrs(new_msk, child);
- subflow_drop_ctx(child);
- goto out;
+ goto fallback;
}
/* ssk inherits options of listener sk */
ctx->setsockopt_seq = listener->setsockopt_seq;
if (ctx->mp_capable) {
- /* this can't race with mptcp_close(), as the msk is
- * not yet exposted to user-space
- */
- inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
+ ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req);
+ if (!ctx->conn)
+ goto fallback;
- /* record the newly created socket as the first msk
- * subflow, but don't link it yet into conn_list
- */
- WRITE_ONCE(mptcp_sk(new_msk)->first, child);
+ ctx->subflow_id = 1;
+ owner = mptcp_sk(ctx->conn);
- /* new mpc subflow takes ownership of the newly
- * created mptcp socket
- */
- mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
- mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
- mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
- ctx->conn = new_msk;
- new_msk = NULL;
-
- /* set msk addresses early to ensure mptcp_pm_get_local_id()
- * uses the correct data
- */
- mptcp_copy_inaddrs(ctx->conn, child);
+ if (mp_opt.deny_join_id0)
+ WRITE_ONCE(owner->pm.remote_deny_join_id0, true);
+
+ mptcp_pm_new_connection(owner, child, 1);
/* with OoO packets we can reach here without ingress
* mpc option
*/
- if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)
- mptcp_subflow_fully_established(ctx, &mp_opt);
+ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
+ mptcp_pm_fully_established(owner, child);
+ ctx->pm_notified = 1;
+ }
} else if (ctx->mp_join) {
- struct mptcp_sock *owner;
-
owner = subflow_req->msk;
if (!owner) {
subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
goto dispose_child;
}
+ if (!subflow_hmac_valid(subflow_req, &mp_opt)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ goto dispose_child;
+ }
+
+ if (!mptcp_can_accept_new_subflow(owner)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ goto dispose_child;
+ }
+
/* move the msk reference ownership to the subflow */
subflow_req->msk = NULL;
ctx->conn = (struct sock *)owner;
if (subflow_use_different_sport(owner, sk)) {
- pr_debug("ack inet_sport=%d %d",
+ pr_debug("ack inet_sport=%d %d\n",
ntohs(inet_sk(sk)->inet_sport),
ntohs(inet_sk((struct sock *)owner)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
goto dispose_child;
}
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
}
- if (!mptcp_finish_join(child))
+ if (!mptcp_finish_join(child)) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child);
+
+ subflow_add_reset_reason(skb, subflow->reset_reason);
goto dispose_child;
+ }
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
tcp_rsk(req)->drop_req = true;
}
}
-out:
- /* dispose of the left over mptcp master, if any */
- if (unlikely(new_msk))
- mptcp_force_close(new_msk);
-
/* check for expected invariant - should never trigger, just help
- * catching eariler subtle bugs
+ * catching earlier subtle bugs
*/
WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
(!mptcp_subflow_ctx(child) ||
@@ -898,18 +955,25 @@ out:
return child;
dispose_child:
- subflow_drop_ctx(child);
+ mptcp_subflow_drop_ctx(child);
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
- req->rsk_ops->send_reset(sk, skb);
+ reason = mptcp_get_rst_reason(skb);
+ req->rsk_ops->send_reset(sk, skb, reason);
/* The last child reference will be released by the caller */
return child;
+
+fallback:
+ if (fallback)
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+ mptcp_subflow_drop_ctx(child);
+ return child;
}
static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
-static struct proto tcp_prot_override;
+static struct proto tcp_prot_override __ro_after_init;
enum mapping_status {
MAPPING_OK,
@@ -917,12 +981,13 @@ enum mapping_status {
MAPPING_EMPTY,
MAPPING_DATA_FIN,
MAPPING_DUMMY,
- MAPPING_BAD_CSUM
+ MAPPING_BAD_CSUM,
+ MAPPING_NODSS
};
static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
- pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+ pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n",
ssn, subflow->map_subflow_seq, subflow->map_data_len);
}
@@ -932,8 +997,10 @@ static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
unsigned int skb_consumed;
skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
- if (WARN_ON_ONCE(skb_consumed >= skb->len))
+ if (unlikely(skb_consumed >= skb->len)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return true;
+ }
return skb->len - skb_consumed <= subflow->map_data_len -
mptcp_subflow_get_map_offset(subflow);
@@ -1072,8 +1139,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
return MAPPING_EMPTY;
}
+ /* If the required DSS has likely been dropped by a middlebox */
if (!subflow->map_valid)
- return MAPPING_INVALID;
+ return MAPPING_NODSS;
goto validate_seq;
}
@@ -1082,17 +1150,18 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
data_len = mpext->data_len;
if (data_len == 0) {
- pr_debug("infinite mapping received");
+ pr_debug("infinite mapping received\n");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
- subflow->map_data_len = 0;
return MAPPING_INVALID;
}
if (mpext->data_fin == 1) {
+ u64 data_fin_seq;
+
if (data_len == 1) {
bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
mpext->dsn64);
- pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
+ pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq);
if (subflow->map_valid) {
/* A DATA_FIN might arrive in a DSS
* option before the previous mapping
@@ -1101,26 +1170,26 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
*/
skb_ext_del(skb, SKB_EXT_MPTCP);
return MAPPING_OK;
- } else {
- if (updated && schedule_work(&msk->work))
- sock_hold((struct sock *)msk);
-
- return MAPPING_DATA_FIN;
}
- } else {
- u64 data_fin_seq = mpext->data_seq + data_len - 1;
- /* If mpext->data_seq is a 32-bit value, data_fin_seq
- * must also be limited to 32 bits.
- */
- if (!mpext->dsn64)
- data_fin_seq &= GENMASK_ULL(31, 0);
+ if (updated)
+ mptcp_schedule_work((struct sock *)msk);
- mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
- pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
- data_fin_seq, mpext->dsn64);
+ return MAPPING_DATA_FIN;
}
+ data_fin_seq = mpext->data_seq + data_len - 1;
+
+ /* If mpext->data_seq is a 32-bit value, data_fin_seq must also
+ * be limited to 32 bits.
+ */
+ if (!mpext->dsn64)
+ data_fin_seq &= GENMASK_ULL(31, 0);
+
+ mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
+ pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n",
+ data_fin_seq, mpext->dsn64);
+
/* Adjust for DATA_FIN using 1 byte of sequence space */
data_len--;
}
@@ -1164,7 +1233,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
if (unlikely(subflow->map_csum_reqd != csum_reqd))
return MAPPING_INVALID;
- pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+ pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
subflow->map_seq, subflow->map_subflow_seq,
subflow->map_data_len, subflow->map_csum_reqd,
subflow->map_data_csum);
@@ -1189,62 +1258,83 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
- u32 incr;
+ struct tcp_sock *tp = tcp_sk(ssk);
+ u32 offset, incr, avail_len;
+
+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+ if (WARN_ON_ONCE(offset > skb->len))
+ goto out;
- incr = limit >= skb->len ? skb->len + fin : limit;
+ avail_len = skb->len - offset;
+ incr = limit >= avail_len ? avail_len + fin : limit;
- pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
- subflow->map_subflow_seq);
+ pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len,
+ offset, subflow->map_subflow_seq);
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
tcp_sk(ssk)->copied_seq += incr;
+
+out:
if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
sk_eat_skb(ssk, skb);
if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
subflow->map_valid = 0;
}
-/* sched mptcp worker to remove the subflow if no more data is pending */
+static bool subflow_is_done(const struct sock *sk)
+{
+ return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
+}
+
+/* sched mptcp worker for subflow cleanup if no more data is pending */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
- if (likely(ssk->sk_state != TCP_CLOSE))
+ if (likely(ssk->sk_state != TCP_CLOSE &&
+ (ssk->sk_state != TCP_CLOSE_WAIT ||
+ inet_sk_state_load(sk) != TCP_ESTABLISHED)))
return;
- if (skb_queue_empty(&ssk->sk_receive_queue) &&
- !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) {
- sock_hold(sk);
- if (!schedule_work(&msk->work))
- sock_put(sk);
- }
-}
+ if (!skb_queue_empty(&ssk->sk_receive_queue))
+ return;
-static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
-{
- struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ mptcp_schedule_work(sk);
- if (subflow->mp_join)
- return false;
- else if (READ_ONCE(msk->csum_enabled))
- return !subflow->valid_csum_seen;
- else
- return !subflow->fully_established;
+ /* when the fallback subflow closes the rx side, trigger a 'dummy'
+ * ingress data fin, so that the msk state will follow along
+ */
+ if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) &&
+ msk->first == ssk &&
+ mptcp_update_rcv_data_fin(msk, subflow->map_seq +
+ subflow->map_data_len, true))
+ mptcp_schedule_work(sk);
}
-static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
unsigned long fail_tout;
- /* greceful failure can happen only on the MPC subflow */
+ /* we are really failing, prevent any later subflow join */
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_infinite_fallback) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+ msk->allow_subflows = false;
+ spin_unlock_bh(&msk->fallback_lock);
+
+ /* graceful failure can happen only on the MPC subflow */
if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
- return;
+ return false;
/* since the close timeout take precedence on the fail one,
* no need to start the latter when the first is already set
*/
if (sock_flag((struct sock *)msk, SOCK_DEAD))
- return;
+ return true;
/* we don't need extreme accuracy here, use a zero fail_tout as special
* value meaning no fail timeout at all;
@@ -1255,7 +1345,8 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
WRITE_ONCE(subflow->fail_tout, fail_tout);
tcp_send_ack(ssk);
- mptcp_reset_timeout(msk, subflow->fail_tout);
+ mptcp_reset_tout_timer(msk, subflow->fail_tout);
+ return true;
}
static bool subflow_check_data_avail(struct sock *ssk)
@@ -1266,7 +1357,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
struct sk_buff *skb;
if (!skb_peek(&ssk->sk_receive_queue))
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ WRITE_ONCE(subflow->data_avail, false);
if (subflow->data_avail)
return true;
@@ -1278,7 +1369,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
status = get_mapping_status(ssk, msk);
trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
- status == MAPPING_BAD_CSUM))
+ status == MAPPING_BAD_CSUM || status == MAPPING_NODSS))
goto fallback;
if (status != MAPPING_OK)
@@ -1293,14 +1384,14 @@ static bool subflow_check_data_avail(struct sock *ssk)
old_ack = READ_ONCE(msk->ack_seq);
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
- pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack,
ack_seq);
if (unlikely(before64(ack_seq, old_ack))) {
mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
continue;
}
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ WRITE_ONCE(subflow->data_avail, true);
break;
}
return true;
@@ -1316,42 +1407,44 @@ fallback:
(subflow->mp_join || subflow->valid_csum_seen)) {
subflow->send_mp_fail = 1;
- if (!READ_ONCE(msk->allow_infinite_fallback)) {
+ if (!mptcp_subflow_fail(msk, ssk)) {
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
goto reset;
}
- mptcp_subflow_fail(msk, ssk);
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ WRITE_ONCE(subflow->data_avail, true);
return true;
}
- if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
+ if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSFALLBACK)) {
/* fatal protocol error, close the socket.
* subflow_error_report() will introduce the appropriate barriers
*/
subflow->reset_transient = 0;
- subflow->reset_reason = MPTCP_RST_EMPTCP;
+ subflow->reset_reason = status == MAPPING_NODSS ?
+ MPTCP_RST_EMIDDLEBOX :
+ MPTCP_RST_EMPTCP;
reset:
- ssk->sk_err = EBADMSG;
+ WRITE_ONCE(ssk->sk_err, EBADMSG);
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
- tcp_send_active_reset(ssk, GFP_ATOMIC);
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ mptcp_send_active_reset_reason(ssk);
+ WRITE_ONCE(subflow->data_avail, false);
return false;
}
-
- mptcp_do_fallback(ssk);
}
skb = skb_peek(&ssk->sk_receive_queue);
subflow->map_valid = 1;
- subflow->map_seq = READ_ONCE(msk->ack_seq);
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ subflow->map_seq = __mptcp_expand_seq(subflow->map_seq,
+ subflow->iasn +
+ TCP_SKB_CB(skb)->seq -
+ subflow->ssn_offset - 1);
+ WRITE_ONCE(subflow->data_avail, true);
return true;
}
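/* Illustration (userspace sketch, not kernel code): the fallback mapping
 * above rebuilds a 64-bit data sequence from a 32-bit TCP-level offset.
 * A generic version of that expansion, valid as long as the new value is
 * within 2^31 of the last known 64-bit sequence; expand_seq32 and the
 * numbers in main are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t expand_seq32(uint64_t last64, uint32_t seq32)
{
	/* a signed 32-bit delta transparently handles low-word wraparound */
	int32_t delta = (int32_t)(seq32 - (uint32_t)last64);

	return last64 + delta;
}

int main(void)
{
	uint64_t last = 0x1fffffff0ULL;	/* low 32 bits about to wrap */

	/* prints 0x200000010: the high word is carried across the wrap */
	printf("%#llx\n", (unsigned long long)expand_seq32(last, 0x10));
	return 0;
}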
@@ -1363,9 +1456,9 @@ bool mptcp_subflow_data_available(struct sock *sk)
if (subflow->map_valid &&
mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
subflow->map_valid = 0;
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ WRITE_ONCE(subflow->data_avail, false);
- pr_debug("Done with mapping: seq=%u data_len=%u",
+ pr_debug("Done with mapping: seq=%u data_len=%u\n",
subflow->map_subflow_seq,
subflow->map_data_len);
}
@@ -1388,41 +1481,20 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
const struct sock *sk = subflow->conn;
*space = __mptcp_space(sk);
- *full_space = tcp_full_space(sk);
-}
-
-void __mptcp_error_report(struct sock *sk)
-{
- struct mptcp_subflow_context *subflow;
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- int err = sock_error(ssk);
-
- if (!err)
- continue;
-
- /* only propagate errors on fallen-back sockets or
- * on MPC connect
- */
- if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
- continue;
-
- inet_sk_state_store(sk, inet_sk_state_load(ssk));
- sk->sk_err = -err;
-
- /* This barrier is coupled with smp_rmb() in mptcp_poll() */
- smp_wmb();
- sk_error_report(sk);
- break;
- }
+ *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}
static void subflow_error_report(struct sock *ssk)
{
struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+ /* bail early if this is a no-op, so that we avoid introducing a
+ * problematic lockdep dependency between TCP accept queue lock
+ * and msk socket spinlock
+ */
+ if (!sk->sk_socket)
+ return;
+
mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
__mptcp_error_report(sk);
@@ -1438,6 +1510,8 @@ static void subflow_data_ready(struct sock *sk)
struct sock *parent = subflow->conn;
struct mptcp_sock *msk;
+ trace_sk_data_ready(sk);
+
msk = mptcp_sk(parent);
if (state & TCPF_LISTEN) {
/* MPJ subflow are removed from accept queue before reaching here,
@@ -1453,10 +1527,18 @@ static void subflow_data_ready(struct sock *sk)
WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
!subflow->mp_join && !(state & TCPF_CLOSE));
- if (mptcp_subflow_data_available(sk))
+ if (mptcp_subflow_data_available(sk)) {
mptcp_data_ready(parent, sk);
- else if (unlikely(sk->sk_err))
+
+ /* subflow-level lowat tests are not relevant:
+ * respect the msk-level threshold, possibly mandating an immediate ack
+ */
+ if (mptcp_data_avail(msk) < parent->sk_rcvlowat &&
+ (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ } else if (unlikely(sk->sk_err)) {
subflow_error_report(sk);
+ }
}
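/* Illustration (userspace sketch, not kernel code): the hunk above honours
 * the msk-level receive low-water mark when deciding whether to force an
 * immediate ACK. From user space that threshold is just SO_RCVLOWAT on the
 * MPTCP socket; minimal example, assuming a kernel with CONFIG_MPTCP and
 * headers exposing IPPROTO_MPTCP (262 on Linux).
 */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

int main(void)
{
	int lowat = 4096;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* wake the reader only once this many bytes are queued at the
	 * MPTCP level; per-subflow thresholds are not consulted
	 */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat)) < 0)
		perror("setsockopt(SO_RCVLOWAT)");
	close(fd);
	return 0;
}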
static void subflow_write_space(struct sock *ssk)
@@ -1486,7 +1568,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
- pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+ pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n",
subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
if (likely(icsk->icsk_af_ops == target))
@@ -1528,28 +1610,31 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
+ int local_id = local->addr.id;
struct sockaddr_storage addr;
int remote_id = remote->id;
- int local_id = loc->id;
int err = -ENOTCONN;
struct socket *sf;
struct sock *ssk;
u32 remote_token;
int addrlen;
- int ifindex;
- u8 flags;
+ /* The userspace PM sent the request too early? */
if (!mptcp_is_fully_established(sk))
goto err_out;
- err = mptcp_subflow_create_socket(sk, loc->family, &sf);
- if (err)
+ err = mptcp_subflow_create_socket(sk, local->addr.family, &sf);
+ if (err) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR);
+ pr_debug("msk=%p local=%d remote=%d create sock error: %d\n",
+ msk, local_id, remote_id, err);
goto err_out;
+ }
ssk = sf->sk;
subflow = mptcp_subflow_ctx(ssk);
@@ -1557,48 +1642,66 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
get_random_bytes(&subflow->local_nonce, sizeof(u32));
} while (!subflow->local_nonce);
- if (local_id)
+ /* if 'IPADDRANY', the ID will be set later, after the routing */
+ if (local->addr.family == AF_INET) {
+ if (!local->addr.addr.s_addr)
+ local_id = -1;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_any(&local->addr.addr6))
+ local_id = -1;
+#endif
+ }
+
+ if (local_id >= 0)
subflow_set_local_id(subflow, local_id);
- mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
- &flags, &ifindex);
subflow->remote_key_valid = 1;
- subflow->remote_key = msk->remote_key;
- subflow->local_key = msk->local_key;
+ subflow->remote_key = READ_ONCE(msk->remote_key);
+ subflow->local_key = READ_ONCE(msk->local_key);
subflow->token = msk->token;
- mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
+ mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family);
addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (addr.ss_family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- mptcp_sockopt_sync(msk, ssk);
-
- ssk->sk_bound_dev_if = ifindex;
- err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
- if (err)
+ ssk->sk_bound_dev_if = local->ifindex;
+ err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen);
+ if (err) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR);
+ pr_debug("msk=%p local=%d remote=%d bind error: %d\n",
+ msk, local_id, remote_id, err);
goto failed;
+ }
mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
- pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
+ pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk,
remote_token, local_id, remote_id);
subflow->remote_token = remote_token;
- subflow->remote_id = remote_id;
+ WRITE_ONCE(subflow->remote_id, remote_id);
subflow->request_join = 1;
- subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ subflow->subflow_id = msk->subflow_id++;
mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
sock_hold(ssk);
list_add_tail(&subflow->node, &msk->conn_list);
- err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
- if (err && err != -EINPROGRESS)
+ err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK);
+ if (err && err != -EINPROGRESS) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR);
+ pr_debug("msk=%p local=%d remote=%d connect error: %d\n",
+ msk, local_id, remote_id, err);
goto failed_unlink;
+ }
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX);
/* discard the subflow socket */
mptcp_sock_graft(ssk, sk->sk_socket);
iput(SOCK_INODE(sf));
- WRITE_ONCE(msk->allow_infinite_fallback, false);
+ mptcp_stop_tout_timer(sk);
return 0;
failed_unlink:
@@ -1617,30 +1720,39 @@ err_out:
return err;
}
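/* Illustration (userspace sketch, not kernel code): the connect path above
 * defers picking a local address ID when the endpoint carries the
 * unspecified ("any") address, since the real source is only known after
 * routing. A standalone version of that address test; addr_is_any is an
 * invented name.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/socket.h>

static bool addr_is_any(int family, const void *addr)
{
	if (family == AF_INET)
		return ((const struct in_addr *)addr)->s_addr == INADDR_ANY;
	if (family == AF_INET6)
		return IN6_IS_ADDR_UNSPECIFIED((const struct in6_addr *)addr);
	return false;
}

int main(void)
{
	struct in_addr v4;
	struct in6_addr v6 = IN6ADDR_ANY_INIT;

	inet_pton(AF_INET, "0.0.0.0", &v4);
	printf("v4 any: %d, v6 any: %d\n",
	       addr_is_any(AF_INET, &v4), addr_is_any(AF_INET6, &v6));
	return 0;
}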
-static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp)
+{
+ /* Only if the msk has been accepted already (and not orphaned). */
+ if (!mem_cgroup_sockets_enabled || !sk->sk_socket)
+ return;
+
+ mem_cgroup_sk_inherit(sk, ssk);
+ __sk_charge(ssk, gfp);
+}
+
+void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
- struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
- *child_skcd = &child->sk_cgrp_data;
+ struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data,
+ *ssk_cd = &ssk->sk_cgrp_data;
/* only the additional subflows created by kworkers have to be modified */
- if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
- cgroup_id(sock_cgroup_ptr(child_skcd))) {
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = parent->sk_memcg;
-
- mem_cgroup_sk_free(child);
- if (memcg && css_tryget(&memcg->css))
- child->sk_memcg = memcg;
-#endif /* CONFIG_MEMCG */
-
- cgroup_sk_free(child_skcd);
- *child_skcd = *parent_skcd;
- cgroup_sk_clone(child_skcd);
+ if (cgroup_id(sock_cgroup_ptr(sk_cd)) !=
+ cgroup_id(sock_cgroup_ptr(ssk_cd))) {
+ cgroup_sk_free(ssk_cd);
+ *ssk_cd = *sk_cd;
+ cgroup_sk_clone(sk_cd);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
+static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+{
+ __mptcp_inherit_cgrp_data(parent, child);
+ if (mem_cgroup_sockets_enabled)
+ mem_cgroup_sk_inherit(parent, child);
+}
+
static void mptcp_subflow_ops_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1679,7 +1791,11 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
if (err)
return err;
- lock_sock(sf->sk);
+ lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);
+
+ err = security_mptcp_add_subflow(sk, sf->sk);
+ if (err)
+ goto err_free;
/* the newly created socket has to be in the same cgroup as its parent */
mptcp_attach_cgroup(sk, sf->sk);
@@ -1688,19 +1804,15 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
* needs it.
* Update ns_tracker to current stack trace and refcounted tracker.
*/
- __netns_tracker_free(net, &sf->sk->ns_tracker, false);
- sf->sk->sk_net_refcnt = 1;
- get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sf->sk);
err = tcp_set_ulp(sf->sk, "mptcp");
- release_sock(sf->sk);
+ if (err)
+ goto err_free;
- if (err) {
- sock_release(sf);
- return err;
- }
+ mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
+ release_sock(sf->sk);
- /* the newly created socket really belongs to the owning MPTCP master
+ /* the newly created socket really belongs to the owning MPTCP
* socket, even if for additional subflows the allocation is performed
* by a kernel workqueue. Adjust inode references, so that the
* procfs/diag interfaces really show this one belonging to the correct
@@ -1711,7 +1823,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
subflow = mptcp_subflow_ctx(sf->sk);
- pr_debug("subflow=%p", subflow);
+ pr_debug("subflow=%p\n", subflow);
*new_sock = sf;
sock_hold(sk);
@@ -1719,6 +1831,11 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
mptcp_subflow_ops_override(sf->sk);
return 0;
+
+err_free:
+ release_sock(sf->sk);
+ sock_release(sf);
+ return err;
}
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
@@ -1735,9 +1852,10 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
INIT_LIST_HEAD(&ctx->node);
INIT_LIST_HEAD(&ctx->delegated_node);
- pr_debug("subflow=%p", ctx);
+ pr_debug("subflow=%p\n", ctx);
ctx->tcp_sock = sk;
+ WRITE_ONCE(ctx->local_id, -1);
return ctx;
}
@@ -1753,11 +1871,6 @@ static void __subflow_state_change(struct sock *sk)
rcu_read_unlock();
}
-static bool subflow_is_done(const struct sock *sk)
-{
- return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
-}
-
static void subflow_state_change(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -1766,12 +1879,9 @@ static void subflow_state_change(struct sock *sk)
__subflow_state_change(sk);
if (subflow_simultaneous_connect(sk)) {
- mptcp_propagate_sndbuf(parent, sk);
- mptcp_do_fallback(sk);
- mptcp_rcv_space_init(mptcp_sk(parent), sk);
- pr_fallback(mptcp_sk(parent));
+ WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK));
subflow->conn_finished = 1;
- mptcp_set_connected(parent);
+ mptcp_propagate_state(parent, sk, subflow, NULL);
}
/* as recvmsg() does not acquire the subflow socket for ssk selection
@@ -1784,43 +1894,26 @@ static void subflow_state_change(struct sock *sk)
subflow_error_report(sk);
subflow_sched_work_if_closed(mptcp_sk(parent), sk);
-
- if (__mptcp_check_fallback(mptcp_sk(parent)) &&
- !subflow->rx_eof && subflow_is_done(sk)) {
- subflow->rx_eof = 1;
- mptcp_subflow_eof(parent);
- }
}
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
- struct mptcp_sock *msk, *next, *head = NULL;
- struct request_sock *req;
+ struct request_sock *req, *head, *tail;
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk, *ssk;
- /* build a list of all unaccepted mptcp sockets */
+ /* Due to lock dependencies, no relevant lock can be acquired under rskq_lock.
+ * Splice the req list, so that accept() cannot reach the pending ssk after
+ * the listener socket is released below.
+ */
spin_lock_bh(&queue->rskq_lock);
- for (req = queue->rskq_accept_head; req; req = req->dl_next) {
- struct mptcp_subflow_context *subflow;
- struct sock *ssk = req->sk;
- struct mptcp_sock *msk;
-
- if (!sk_is_mptcp(ssk))
- continue;
-
- subflow = mptcp_subflow_ctx(ssk);
- if (!subflow || !subflow->conn)
- continue;
-
- /* skip if already in list */
- msk = mptcp_sk(subflow->conn);
- if (msk->dl_next || msk == head)
- continue;
-
- msk->dl_next = head;
- head = msk;
- }
+ head = queue->rskq_accept_head;
+ tail = queue->rskq_accept_tail;
+ queue->rskq_accept_head = NULL;
+ queue->rskq_accept_tail = NULL;
spin_unlock_bh(&queue->rskq_lock);
+
if (!head)
return;
@@ -1829,40 +1922,49 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s
*/
release_sock(listener_ssk);
- for (msk = head; msk; msk = next) {
- struct sock *sk = (struct sock *)msk;
- bool do_cancel_work;
+ for (req = head; req; req = req->dl_next) {
+ ssk = req->sk;
+ if (!sk_is_mptcp(ssk))
+ continue;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ if (!subflow || !subflow->conn)
+ continue;
+ sk = subflow->conn;
sock_hold(sk);
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- next = msk->dl_next;
- msk->first = NULL;
- msk->dl_next = NULL;
- do_cancel_work = __mptcp_close(sk, 0);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ __mptcp_unaccepted_force_close(sk);
release_sock(sk);
- if (do_cancel_work) {
- /* lockdep will report a false positive ABBA deadlock
- * between cancel_work_sync and the listener socket.
- * The involved locks belong to different sockets WRT
- * the existing AB chain.
- * Using a per socket key is problematic as key
- * deregistration requires process context and must be
- * performed at socket disposal time, in atomic
- * context.
- * Just tell lockdep to consider the listener socket
- * released here.
- */
- mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
- mptcp_cancel_work(sk);
- mutex_acquire(&listener_sk->sk_lock.dep_map,
- SINGLE_DEPTH_NESTING, 0, _RET_IP_);
- }
+
+ /* lockdep will report a false positive ABBA deadlock
+ * between cancel_work_sync and the listener socket.
+ * The involved locks belong to different sockets WRT
+ * the existing AB chain.
+ * Using a per socket key is problematic as key
+ * deregistration requires process context and must be
+ * performed at socket disposal time, in atomic
+ * context.
+ * Just tell lockdep to consider the listener socket
+ * released here.
+ */
+ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
+ mptcp_cancel_work(sk);
+ mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
sock_put(sk);
}
/* we are still under the listener msk socket lock */
lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+
+ /* restore the listener queue, to let the TCP code clean it up */
+ spin_lock_bh(&queue->rskq_lock);
+ WARN_ON_ONCE(queue->rskq_accept_head);
+ queue->rskq_accept_head = head;
+ queue->rskq_accept_tail = tail;
+ spin_unlock_bh(&queue->rskq_lock);
}
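/* Illustration (userspace sketch, not kernel code): the cleanup above steals
 * the whole accept queue (head and tail) in one critical section, walks it
 * lock-free, then restores it. A minimal standalone version of the
 * steal-under-lock step; all names are invented for the example.
 */
#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

struct queue {
	pthread_mutex_t lock;
	struct node *head, *tail;
};

static struct node *queue_steal(struct queue *q)
{
	struct node *head;

	pthread_mutex_lock(&q->lock);
	head = q->head;		/* take everything in one shot */
	q->head = NULL;
	q->tail = NULL;
	pthread_mutex_unlock(&q->lock);
	return head;
}

int main(void)
{
	struct node b = { NULL, 2 }, a = { &b, 1 };
	struct queue q = { PTHREAD_MUTEX_INITIALIZER, &a, &b };

	for (struct node *n = queue_steal(&q); n; n = n->next)
		printf("%d\n", n->val);
	return 0;
}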
static int subflow_ulp_init(struct sock *sk)
@@ -1886,7 +1988,7 @@ static int subflow_ulp_init(struct sock *sk)
goto out;
}
- pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
+ pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family);
tp->is_mptcp = 1;
ctx->icsk_af_ops = icsk->icsk_af_ops;
@@ -1921,6 +2023,13 @@ static void subflow_ulp_release(struct sock *ssk)
* when the subflow is still unaccepted
*/
release = ctx->disposable || list_empty(&ctx->node);
+
+ /* inet_child_forget() does not call sk_state_change(),
+ * explicitly trigger the socket close machinery
+ */
+ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
+ &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
sock_put(sk);
}
@@ -1954,7 +2063,6 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
new_ctx->tcp_error_report = old_ctx->tcp_error_report;
new_ctx->rel_write_seq = 1;
- new_ctx->tcp_sock = newsk;
if (subflow_req->mp_capable) {
/* see comments in subflow_syn_recv_sock(), MPTCP connection
@@ -1967,14 +2075,15 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->idsn = subflow_req->idsn;
/* this is the first subflow, id is always 0 */
- new_ctx->local_id_valid = 1;
+ subflow_set_local_id(new_ctx, 0);
} else if (subflow_req->mp_join) {
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
- new_ctx->fully_established = 1;
+ WRITE_ONCE(new_ctx->fully_established, true);
new_ctx->remote_key_valid = 1;
new_ctx->backup = subflow_req->backup;
- new_ctx->remote_id = subflow_req->remote_id;
+ new_ctx->request_bkup = subflow_req->request_bkup;
+ WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
new_ctx->token = subflow_req->token;
new_ctx->thmac = subflow_req->thmac;
@@ -1988,13 +2097,30 @@ static void subflow_ulp_clone(const struct request_sock *req,
static void tcp_release_cb_override(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ long status;
- if (mptcp_subflow_has_delegated_action(subflow))
- mptcp_subflow_process_delegated(ssk);
+ /* process and clear all the pending actions, but leave the subflow in
+ * the napi queue. To respect locking, only the same CPU that originated
+ * the action can touch the list. mptcp_napi_poll will take care of it.
+ */
+ status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0);
+ if (status)
+ mptcp_subflow_process_delegated(ssk, status);
tcp_release_cb(ssk);
}
+static int tcp_abort_override(struct sock *ssk, int err)
+{
+ /* closing a listener subflow requires a great deal of care.
+ * keep it simple and just prevent such an operation
+ */
+ if (inet_sk_state_load(ssk) == TCP_LISTEN)
+ return -EINVAL;
+
+ return tcp_abort(ssk, err);
+}
+
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
.name = "mptcp",
.owner = THIS_MODULE,
@@ -2039,6 +2165,11 @@ void __init mptcp_subflow_init(void)
tcp_prot_override = tcp_prot;
tcp_prot_override.release_cb = tcp_release_cb_override;
+ tcp_prot_override.diag_destroy = tcp_abort_override;
+#ifdef CONFIG_BPF_SYSCALL
+ /* Disable sockmap processing for subflows */
+ tcp_prot_override.psock_update_sk_prot = NULL;
+#endif
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* In struct mptcp_subflow_request_sock, we assume the TCP request sock
@@ -2070,11 +2201,15 @@ void __init mptcp_subflow_init(void)
subflow_v6m_specific.send_check = ipv4_specific.send_check;
subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
- subflow_v6m_specific.net_frag_header_len = 0;
subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
tcpv6_prot_override = tcpv6_prot;
tcpv6_prot_override.release_cb = tcp_release_cb_override;
+ tcpv6_prot_override.diag_destroy = tcp_abort_override;
+#ifdef CONFIG_BPF_SYSCALL
+ /* Disable sockmap processing for subflows */
+ tcpv6_prot_override.psock_update_sk_prot = NULL;
+#endif
#endif
mptcp_diag_subflow_init(&subflow_ulp_ops);
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 65430f314a68..5bb924534387 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -134,7 +134,7 @@ int mptcp_token_new_request(struct request_sock *req)
/**
* mptcp_token_new_connect - create new key/idsn/token for subflow
- * @sk: the socket that will initiate a connection
+ * @ssk: the socket that will initiate a connection
*
* This function is called when a new outgoing mptcp connection is
* initiated.
@@ -148,11 +148,12 @@ int mptcp_token_new_request(struct request_sock *req)
*
* returns 0 on success.
*/
-int mptcp_token_new_connect(struct sock *sk)
+int mptcp_token_new_connect(struct sock *ssk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
int retries = MPTCP_TOKEN_MAX_RETRIES;
+ struct sock *sk = subflow->conn;
struct token_bucket *bucket;
again:
@@ -169,12 +170,13 @@ again:
}
pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
- sk, subflow->local_key, subflow->token, subflow->idsn);
+ ssk, subflow->local_key, subflow->token, subflow->idsn);
WRITE_ONCE(msk->token, subflow->token);
__sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
bucket->chain_len++;
spin_unlock_bh(&bucket->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
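/* Illustration (userspace sketch, not kernel code): with the inuse
 * accounting added above, established MPTCP sockets show up in the
 * per-protocol counters. A minimal reader for them, assuming the kernel
 * exposes /proc/net/protocols and names the rows "MPTCP"/"MPTCPv6".
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/protocols", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "protocol", 8) || !strncmp(line, "MPTCP", 5))
			fputs(line, stdout);
	fclose(f);
	return 0;
}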
@@ -190,8 +192,10 @@ void mptcp_token_accept(struct mptcp_subflow_request_sock *req,
struct mptcp_sock *msk)
{
struct mptcp_subflow_request_sock *pos;
+ struct sock *sk = (struct sock *)msk;
struct token_bucket *bucket;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
bucket = token_bucket(req->token);
spin_lock_bh(&bucket->lock);
@@ -370,12 +374,14 @@ void mptcp_token_destroy_request(struct request_sock *req)
*/
void mptcp_token_destroy(struct mptcp_sock *msk)
{
+ struct sock *sk = (struct sock *)msk;
struct token_bucket *bucket;
struct mptcp_sock *pos;
if (sk_unhashed((struct sock *)msk))
return;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
bucket = token_bucket(msk->token);
spin_lock_bh(&bucket->lock);
pos = __token_lookup_msk(bucket, msk->token);
diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c
index 5d984bec1cd8..4fc39fa2e262 100644
--- a/net/mptcp/token_test.c
+++ b/net/mptcp/token_test.c
@@ -52,11 +52,19 @@ static struct mptcp_subflow_context *build_ctx(struct kunit *test)
static struct mptcp_sock *build_msk(struct kunit *test)
{
struct mptcp_sock *msk;
+ struct sock *sk;
msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER);
KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk);
refcount_set(&((struct sock *)msk)->sk_refcnt, 1);
sock_net_set((struct sock *)msk, &init_net);
+
+ sk = (struct sock *)msk;
+
+ /* be sure the token helpers can dereference sk->sk_prot */
+ sk->sk_prot = &tcp_prot;
+ sk->sk_protocol = IPPROTO_MPTCP;
+
return msk;
}
@@ -140,3 +148,4 @@ static struct kunit_suite mptcp_token_suite = {
kunit_test_suite(mptcp_token_suite);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KUnit tests for MPTCP Token");