From 98247bc16a27cf8ead4c47ce9f15888be85841fc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:40 -0800 Subject: mptcp: fix race in overlapping signal events After commit a88c9e496937 ("mptcp: do not block subflows creation on errors"), if a signal address races with a failing subflow creation, the subflow creation failure control path can trigger the selection of the next address to be announced while the current announced is still pending. The above will cause the unintended suppression of the ADD_ADDR announce. Fix the issue skipping the to-be-suppressed announce before it will mark an endpoint as already used. The relevant announce will be triggered again when the current one will complete. Fixes: a88c9e496937 ("mptcp: do not block subflows creation on errors") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 356f596e2032..82f82a513f5b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -546,6 +546,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.add_addr_signaled < add_addr_signal_max) { local = select_signal_address(pernet, msk); + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); -- cgit From 837cf45df163a3780bc04b555700231e95b31dc9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:41 -0800 Subject: mptcp: fix race in incoming ADD_ADDR option processing If an MPTCP endpoint received multiple consecutive incoming ADD_ADDR options, mptcp_pm_add_addr_received() can overwrite the current remote address value after the PM lock is released in mptcp_pm_nl_add_addr_received() and before such address is echoed. Fix the issue caching the remote address value a little earlier and always using the cached value after releasing the PM lock. Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 82f82a513f5b..4b5d795383cd 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -660,6 +660,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; + bool reset_port = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); @@ -669,15 +670,19 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); - if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) + remote = msk->pm.remote; + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) goto add_addr_echo; + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) { + reset_port = true; + remote.port = sk->sk_dport; + } + /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ - remote = msk->pm.remote; - if (!remote.port) - remote.port = sk->sk_dport; nr = fill_local_addresses_vec(msk, addrs); msk->pm.add_addr_accepted++; @@ -690,8 +695,12 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); + /* be sure to echo exactly the received address */ + if (reset_port) + remote.port = 0; + add_addr_echo: - mptcp_pm_announce_addr(msk, &msk->pm.remote, true); + mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); } -- cgit From f73c1194634506ab60af0debef04671fc431a435 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:42 -0800 Subject: mptcp: add mibs counter for ignored incoming options The MPTCP in kernel path manager has some constraints on incoming addresses announce processing, so that in edge scenarios it can end-up dropping (ignoring) some of such announces. The above is not very limiting in practice since such scenarios are very uncommon and MPTCP will recover due to ADD_ADDR retransmissions. This patch adds a few MIB counters to account for such drop events to allow easier introspection of the critical scenarios. Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/mib.c | 2 ++ net/mptcp/mib.h | 2 ++ net/mptcp/pm.c | 8 ++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3240b72271a7..7558802a1435 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -35,12 +35,14 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), + SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), + SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index ecd3d8b117e0..2966fcb6548b 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -28,12 +28,14 @@ enum linux_mptcp_mib_field { MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ + MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ + MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 696b2c4613a7..7bea318ac5f2 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -213,6 +213,8 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); @@ -253,8 +255,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); - mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_list_rx = *rm_list; + if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) + pm->rm_list_rx = *rm_list; + else + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } -- cgit From 07c2c7a3b622e109ba4d2efd916da0477617ce81 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 24 Feb 2022 16:52:57 -0800 Subject: mptcp: accurate SIOCOUTQ for fallback socket The MPTCP SIOCOUTQ implementation is not very accurate in case of fallback: it only measures the data in the MPTCP-level write queue, but it does not take in account the subflow write queue utilization. In case of fallback the first can be empty, while the latter is not. The above produces sporadic self-tests issues and can foul legit user-space application. Fix the issue additionally querying the subflow in case of fallback. Fixes: 644807e3e462 ("mptcp: add SIOCINQ, OUTQ and OUTQNSD ioctls") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/260 Reported-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f60f01b14fac..12bb28c5007e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3294,6 +3294,17 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return 0; delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. + */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } if (delta > INT_MAX) delta = INT_MAX; -- cgit From 877d11f0332cd2160e19e3313e262754c321fa36 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 24 Feb 2022 16:52:59 -0800 Subject: mptcp: Correctly set DATA_FIN timeout when number of retransmits is large Syzkaller with UBSAN uncovered a scenario where a large number of DATA_FIN retransmits caused a shift-out-of-bounds in the DATA_FIN timeout calculation: ================================================================================ UBSAN: shift-out-of-bounds in net/mptcp/protocol.c:470:29 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 1 PID: 13059 Comm: kworker/1:0 Not tainted 5.17.0-rc2-00630-g5fbf21c90c60 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: events mptcp_worker Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_shift_out_of_bounds.cold+0xb2/0x20e lib/ubsan.c:330 mptcp_set_datafin_timeout net/mptcp/protocol.c:470 [inline] __mptcp_retrans.cold+0x72/0x77 net/mptcp/protocol.c:2445 mptcp_worker+0x58a/0xa70 net/mptcp/protocol.c:2528 process_one_work+0x9df/0x16d0 kernel/workqueue.c:2307 worker_thread+0x95/0xe10 kernel/workqueue.c:2454 kthread+0x2f4/0x3b0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 ================================================================================ This change limits the maximum timeout by limiting the size of the shift, which keeps all intermediate values in-bounds. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/259 Fixes: 6477dd39e62c ("mptcp: Retransmit DATA_FIN") Acked-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 12bb28c5007e..1c72f25f083e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -466,9 +466,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); + + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) -- cgit