From 72f961320d5d15bfcb26dbe3edaa3f7d25fd2c8a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:40 -0700 Subject: mptcp: try harder to borrow memory from subflow under pressure If the host is under sever memory pressure, and RX forward memory allocation for the msk fails, we try to borrow the required memory from the ingress subflow. The current attempt is a bit flaky: if skb->truesize is less than SK_MEM_QUANTUM, the ssk will not release any memory, and the next schedule will fail again. Instead, directly move the required amount of pages from the ssk to the msk, if available Fixes: 9c3f94e1681b ("mptcp: add missing memory scheduling in the rx path") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/mptcp/protocol.c') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5edc686faff1..534cf500521d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -280,11 +280,13 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, /* try to fetch required memory from subflow */ if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - if (ssk->sk_forward_alloc < skb->truesize) - goto drop; - __sk_mem_reclaim(ssk, skb->truesize); - if (!sk_rmem_schedule(sk, skb, skb->truesize)) + int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT; + + if (ssk->sk_forward_alloc < amount) goto drop; + + ssk->sk_forward_alloc -= amount; + sk->sk_forward_alloc += amount; } /* the skb map_seq accounts for the skb offset: -- cgit From 99d1055ce2469dca3dd14be0991ff8133e25e3d0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:41 -0700 Subject: mptcp: wake-up readers only for in sequence data Currently we rely on the subflow->data_avail field, which is subject to races: ssk1 skb len = 500 DSS(seq=1, len=1000, off=0) # data_avail == MPTCP_SUBFLOW_DATA_AVAIL ssk2 skb len = 500 DSS(seq = 501, len=1000) # data_avail == MPTCP_SUBFLOW_DATA_AVAIL ssk1 skb len = 500 DSS(seq = 1, len=1000, off =500) # still data_avail == MPTCP_SUBFLOW_DATA_AVAIL, # as the skb is covered by a pre-existing map, # which was in-sequence at reception time. Instead we can explicitly check if some has been received in-sequence, propagating the info from __mptcp_move_skbs_from_subflow(). Additionally add the 'ONCE' annotation to the 'data_avail' memory access, as msk will read it outside the subflow socket lock. Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'net/mptcp/protocol.c') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 534cf500521d..f6e62a6dc9fb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -670,15 +670,13 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - mptcp_data_lock(sk); + return false; __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); @@ -690,7 +688,7 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); - mptcp_data_unlock(sk); + return moved > 0; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) @@ -698,7 +696,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); int sk_rbuf, ssk_rbuf; - bool wake; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing @@ -707,28 +704,22 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (unlikely(subflow->disposable)) return; - /* move_skbs_to_msk below can legitly clear the data_avail flag, - * but we will need later to properly woke the reader, cache its - * value - */ - wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL; - if (wake) - set_bit(MPTCP_DATA_READY, &msk->flags); - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) sk_rbuf = ssk_rbuf; - /* over limit? can't append more skbs to msk */ + /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) - goto wake; - - move_skbs_to_msk(msk, ssk); + return; -wake: - if (wake) + /* Wake-up the reader only for in-sequence data */ + mptcp_data_lock(sk); + if (move_skbs_to_msk(msk, ssk)) { + set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); + } + mptcp_data_unlock(sk); } static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) @@ -860,7 +851,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) sock_owned_by_me(sk); mptcp_for_each_subflow(msk, subflow) { - if (subflow->data_avail) + if (READ_ONCE(subflow->data_avail)) return mptcp_subflow_tcp_sock(subflow); } -- cgit From 499ada5073361c631f2a3c4a8aed44d53b6f82ec Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:44 -0700 Subject: mptcp: fix soft lookup in subflow_error_report() Maxim reported a soft lookup in subflow_error_report(): watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:0] RIP: 0010:native_queued_spin_lock_slowpath RSP: 0018:ffffa859c0003bc0 EFLAGS: 00000202 RAX: 0000000000000101 RBX: 0000000000000001 RCX: 0000000000000000 RDX: ffff9195c2772d88 RSI: 0000000000000000 RDI: ffff9195c2772d88 RBP: ffff9195c2772d00 R08: 00000000000067b0 R09: c6e31da9eb1e44f4 R10: ffff9195ef379700 R11: ffff9195edb50710 R12: ffff9195c2772d88 R13: ffff9195f500e3d0 R14: ffff9195ef379700 R15: ffff9195ef379700 FS: 0000000000000000(0000) GS:ffff91961f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c000407000 CR3: 0000000002988000 CR4: 00000000000006f0 Call Trace: _raw_spin_lock_bh subflow_error_report mptcp_subflow_data_available __mptcp_move_skbs_from_subflow mptcp_data_ready tcp_data_queue tcp_rcv_established tcp_v4_do_rcv tcp_v4_rcv ip_protocol_deliver_rcu ip_local_deliver_finish __netif_receive_skb_one_core netif_receive_skb rtl8139_poll 8139too __napi_poll net_rx_action __do_softirq __irq_exit_rcu common_interrupt The calling function - mptcp_subflow_data_available() - can be invoked from different contexts: - plain ssk socket lock - ssk socket lock + mptcp_data_lock - ssk socket lock + mptcp_data_lock + msk socket lock. Since subflow_error_report() tries to acquire the mptcp_data_lock, the latter two call chains will cause soft lookup. This change addresses the issue moving the error reporting call to outer functions, where the held locks list is known and the we can acquire only the needed one. Reported-by: Maxim Galaganov Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/199 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/mptcp/protocol.c') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f6e62a6dc9fb..632350018fb6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -680,6 +680,12 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); + if (unlikely(ssk->sk_err)) { + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &msk->flags); + } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but @@ -1948,6 +1954,9 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); tcp_cleanup_rbuf(ssk, moved); + + if (unlikely(ssk->sk_err)) + __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); } while (!done); -- cgit