diff options
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/mib.c | 17 | ||||
-rw-r--r-- | net/mptcp/mptcp_diag.c | 26 | ||||
-rw-r--r-- | net/mptcp/options.c | 15 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 9 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 447 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 19 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 279 |
7 files changed, 525 insertions, 287 deletions
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index b21ff9be04c6..3240b72271a7 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -72,6 +72,7 @@ bool mptcp_mib_alloc(struct net *net) void mptcp_seq_show(struct seq_file *seq) { + unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; struct net *net = seq->private; int i; @@ -81,17 +82,13 @@ void mptcp_seq_show(struct seq_file *seq) seq_puts(seq, "\nMPTcpExt:"); - if (!net->mib.mptcp_statistics) { - for (i = 0; mptcp_snmp_list[i].name; i++) - seq_puts(seq, " 0"); - - seq_putc(seq, '\n'); - return; - } + memset(sum, 0, sizeof(sum)); + if (net->mib.mptcp_statistics) + snmp_get_cpu_field_batch(sum, mptcp_snmp_list, + net->mib.mptcp_statistics); for (i = 0; mptcp_snmp_list[i].name; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.mptcp_statistics, - mptcp_snmp_list[i].entry)); + seq_printf(seq, " %lu", sum[i]); + seq_putc(seq, '\n'); } diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 292374fb0779..f44125dd6697 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -113,37 +113,13 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_info *info = _info; - u32 flags = 0; - bool slow; - u8 val; r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); if (!info) return; - slow = lock_sock_fast(sk); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); - info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); - info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); - info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); - info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); - val = mptcp_pm_get_add_addr_signal_max(msk); - info->mptcpi_add_addr_signal_max = val; - val = mptcp_pm_get_add_addr_accept_max(msk); - info->mptcpi_add_addr_accepted_max = val; - info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); - if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) - flags |= MPTCP_INFO_FLAG_FALLBACK; - if (READ_ONCE(msk->can_ack)) - flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; - info->mptcpi_flags = flags; - info->mptcpi_token = READ_ONCE(msk->token); - info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = READ_ONCE(msk->snd_una); - info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); - info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); - unlock_sock_fast(sk, slow); + mptcp_diag_fill_info(msk, info); } static const struct inet_diag_handler mptcp_diag_handler = { diff --git a/net/mptcp/options.c b/net/mptcp/options.c index f0f22eb4fd5f..7c3420afb1a0 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -748,9 +748,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ - if (!subflow->send_mp_prio || - ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ @@ -1019,11 +1017,9 @@ static void ack_update_msk(struct mptcp_sock *msk, old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); - /* ACK for data not even sent yet and even above recovery bound? Ignore.*/ - if (unlikely(after64(new_snd_una, snd_nxt))) { - if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt)) - new_snd_una = old_snd_una; - } + /* ACK for data not even sent yet? Ignore.*/ + if (unlikely(after64(new_snd_una, snd_nxt))) + new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; @@ -1335,8 +1331,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } - } else if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 050eea231528..7b96be1e9f14 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -654,9 +654,9 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) } } -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup) +static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -2052,6 +2052,9 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + + /* Cit. 2 subflows ought to be enough for anybody. */ + pernet->subflows_max = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d073b2111382..b7e32e316738 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } +static void mptcp_rmem_charge(struct sock *sk, int size) +{ + mptcp_sk(sk)->rmem_fwd_alloc -= size; +} + static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { @@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; kfree_skb_partial(from, fragstolen); atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); + mptcp_rmem_charge(sk, delta); return true; } @@ -155,6 +160,44 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } +static void __mptcp_rmem_reclaim(struct sock *sk, int amount) +{ + amount >>= SK_MEM_QUANTUM_SHIFT; + mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + __sk_mem_reduce_allocated(sk, amount); +} + +static void mptcp_rmem_uncharge(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int reclaimable; + + msk->rmem_fwd_alloc += size; + reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); + + /* see sk_mem_uncharge() for the rationale behind the following schema */ + if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD)) + __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK); +} + +static void mptcp_rfree(struct sk_buff *skb) +{ + unsigned int len = skb->truesize; + struct sock *sk = skb->sk; + + atomic_sub(len, &sk->sk_rmem_alloc); + mptcp_rmem_uncharge(sk, len); +} + +static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = mptcp_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + mptcp_rmem_charge(sk, skb->truesize); +} + /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -267,7 +310,29 @@ merge_right: end: skb_condense(skb); - skb_set_owner_r(skb, sk); + mptcp_set_owner_r(skb, sk); +} + +static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int amt, amount; + + if (size < msk->rmem_fwd_alloc) + return true; + + amt = sk_mem_pages(size); + amount = amt << SK_MEM_QUANTUM_SHIFT; + msk->rmem_fwd_alloc += amount; + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) { + if (ssk->sk_forward_alloc < amount) { + msk->rmem_fwd_alloc -= amount; + return false; + } + + ssk->sk_forward_alloc -= amount; + } + return true; } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -285,15 +350,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT; - - if (ssk->sk_forward_alloc < amount) - goto drop; - - ssk->sk_forward_alloc -= amount; - sk->sk_forward_alloc += amount; - } + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) + goto drop; has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; @@ -313,7 +371,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; - skb_set_owner_r(skb, sk); + mptcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { @@ -908,124 +966,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static int mptcp_wmem_with_overhead(int size) -{ - return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); -} - -static void __mptcp_wmem_reserve(struct sock *sk, int size) -{ - int amount = mptcp_wmem_with_overhead(size); - struct mptcp_sock *msk = mptcp_sk(sk); - - WARN_ON_ONCE(msk->wmem_reserved); - if (WARN_ON_ONCE(amount < 0)) - amount = 0; - - if (amount <= sk->sk_forward_alloc) - goto reserve; - - /* under memory pressure try to reserve at most a single page - * otherwise try to reserve the full estimate and fallback - * to a single page before entering the error path - */ - if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || - !sk_wmem_schedule(sk, amount)) { - if (amount <= PAGE_SIZE) - goto nomem; - - amount = PAGE_SIZE; - if (!sk_wmem_schedule(sk, amount)) - goto nomem; - } - -reserve: - msk->wmem_reserved = amount; - sk->sk_forward_alloc -= amount; - return; - -nomem: - /* we will wait for memory on next allocation */ - msk->wmem_reserved = -1; -} - -static void __mptcp_update_wmem(struct sock *sk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif - - if (!msk->wmem_reserved) - return; - - if (msk->wmem_reserved < 0) - msk->wmem_reserved = 0; - if (msk->wmem_reserved > 0) { - sk->sk_forward_alloc += msk->wmem_reserved; - msk->wmem_reserved = 0; - } -} - -static bool mptcp_wmem_alloc(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - /* check for pre-existing error condition */ - if (msk->wmem_reserved < 0) - return false; - - if (msk->wmem_reserved >= size) - goto account; - - mptcp_data_lock(sk); - if (!sk_wmem_schedule(sk, size)) { - mptcp_data_unlock(sk); - return false; - } - - sk->sk_forward_alloc -= size; - msk->wmem_reserved += size; - mptcp_data_unlock(sk); - -account: - msk->wmem_reserved -= size; - return true; -} - -static void mptcp_wmem_uncharge(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - if (msk->wmem_reserved < 0) - msk->wmem_reserved = 0; - msk->wmem_reserved += size; -} - static void __mptcp_mem_reclaim_partial(struct sock *sk) { + int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk); + lockdep_assert_held_once(&sk->sk_lock.slock); - __mptcp_update_wmem(sk); + + __mptcp_rmem_reclaim(sk, reclaimable - 1); sk_mem_reclaim_partial(sk); } static void mptcp_mem_reclaim_partial(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - /* if we are experiencing a transint allocation error, - * the forward allocation memory has been already - * released - */ - if (msk->wmem_reserved < 0) - return; - mptcp_data_lock(sk); - sk->sk_forward_alloc += msk->wmem_reserved; - sk_mem_reclaim_partial(sk); - msk->wmem_reserved = sk->sk_forward_alloc; - sk->sk_forward_alloc = 0; + __mptcp_mem_reclaim_partial(sk); mptcp_data_unlock(sk); } @@ -1104,7 +1058,8 @@ out: if (cleaned && tcp_under_memory_pressure(sk)) __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) { + if (snd_una == READ_ONCE(msk->snd_nxt) && + snd_una == READ_ONCE(msk->write_seq)) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { @@ -1114,9 +1069,8 @@ out: static void __mptcp_clean_una_wakeup(struct sock *sk) { -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif + lockdep_assert_held_once(&sk->sk_lock.slock); + __mptcp_clean_una(sk); mptcp_write_space(sk); } @@ -1220,7 +1174,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); - skb->reserved_tailroom = skb->end - skb->tail; + skb->ip_summed = CHECKSUM_PARTIAL; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -1230,31 +1185,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) return NULL; } -static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) +static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; - if (ssk->sk_tx_skb_cache) { - skb = ssk->sk_tx_skb_cache; - if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && - !__mptcp_add_ext(skb, gfp))) - return false; - return true; - } - skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) - return false; + return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { - ssk->sk_tx_skb_cache = skb; - return true; + tcp_skb_entail(ssk, skb); + return skb; } kfree_skb(skb); - return false; + return NULL; } -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) +static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; @@ -1284,23 +1231,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; + int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; - struct sk_buff *skb, *tail; - bool must_collapse = false; - int size_bias = 0; - int avail_size; - size_t ret = 0; + bool can_coalesce = false; + bool reuse_skb = true; + struct sk_buff *skb; + size_t copy; + int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; + /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); - avail_size = info->size_goal; + copy = info->size_goal; + skb = tcp_write_queue_tail(ssk); - if (skb) { + if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later @@ -1313,62 +1266,79 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, goto alloc_skb; } - must_collapse = (info->size_goal > skb->len) && - (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags); - if (must_collapse) { - size_bias = skb->len; - avail_size = info->size_goal - skb->len; + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); + if (!can_coalesce && i >= sysctl_max_skb_frags) { + tcp_mark_push(tcp_sk(ssk), skb); + goto alloc_skb; } - } + copy -= skb->len; + } else { alloc_skb: - if (!must_collapse && - !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held)) - return 0; + skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); + if (!skb) + return -ENOMEM; + + i = skb_shinfo(skb)->nr_frags; + reuse_skb = false; + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + } /* Zero window and all data acked? Probe. */ - avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); - if (avail_size == 0) { + copy = mptcp_check_allowed_size(msk, data_seq, copy); + if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); - if (skb || snd_una != msk->snd_nxt) + if (snd_una != msk->snd_nxt) { + tcp_remove_empty_skb(ssk); return 0; + } + zero_window_probe = true; data_seq = snd_una - 1; - avail_size = 1; - } + copy = 1; - if (WARN_ON_ONCE(info->sent > info->limit || - info->limit > dfrag->data_len)) - return 0; + /* all mptcp-level data is acked, no skbs should be present into the + * ssk write queue + */ + WARN_ON_ONCE(reuse_skb); + } - ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, - dfrag->page, dfrag->offset + info->sent, &ret); - if (!tail) { - tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + copy = min_t(size_t, copy, info->limit - info->sent); + if (!sk_wmem_schedule(ssk, copy)) { + tcp_remove_empty_skb(ssk); return -ENOMEM; } - /* if the tail skb is still the cached one, collapsing really happened. - */ - if (skb == tail) { - TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; - mpext->data_len += ret; + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(dfrag->page); + skb_fill_page_desc(skb, i, dfrag->page, offset, copy); + } + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(ssk, copy); + sk_mem_charge(ssk, copy); + WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + /* on skb reuse we just need to update the DSS len */ + if (reuse_skb) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + mpext->data_len += copy; WARN_ON_ONCE(zero_window_probe); goto out; } - mpext = skb_ext_find(tail, SKB_EXT_MPTCP); - if (WARN_ON_ONCE(!mpext)) { - /* should never reach here, stream corrupted */ - return -EINVAL; - } - memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; - mpext->data_len = ret; + mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; @@ -1377,18 +1347,18 @@ alloc_skb: mpext->dsn64); if (zero_window_probe) { - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) - mptcp_update_data_checksum(tail, ret); + mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) - mptcp_update_data_checksum(tail, ret); - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; + mptcp_update_data_checksum(skb, copy); + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; + return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ @@ -1498,13 +1468,44 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) return NULL; } -static void mptcp_push_release(struct sock *sk, struct sock *ssk, - struct mptcp_sendmsg_info *info) +static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } +static void mptcp_update_post_push(struct mptcp_sock *msk, + struct mptcp_data_frag *dfrag, + u32 sent) +{ + u64 snd_nxt_new = dfrag->data_seq; + + dfrag->already_sent += sent; + + msk->snd_burst -= sent; + + snd_nxt_new += dfrag->already_sent; + + /* snd_nxt_new can be smaller than snd_nxt in case mptcp + * is recovering after a failover. In that event, this re-sends + * old segments. + * + * Thus compute snd_nxt_new candidate based on + * the dfrag->data_seq that was sent and the data + * that has been handed to the subflow for transmission + * and skip update in case it was old dfrag. + */ + if (likely(after64(snd_nxt_new, msk->snd_nxt))) + msk->snd_nxt = snd_nxt_new; +} + +static void mptcp_check_and_set_pending(struct sock *sk) +{ + if (mptcp_send_head(sk) && + !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; @@ -1530,7 +1531,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) * the last round, release prev_ssk */ if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(sk, prev_ssk, &info); + mptcp_push_release(prev_ssk, &info); if (!ssk) goto out; @@ -1543,24 +1544,22 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { - mptcp_push_release(sk, ssk, &info); + mptcp_push_release(ssk, &info); goto out; } info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } /* at this point we held the socket lock for the last subflow we used */ if (ssk) - mptcp_push_release(sk, ssk, &info); + mptcp_push_release(ssk, &info); out: /* ensure the rtx timer is running */ @@ -1606,13 +1605,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) goto out; info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; first = false; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@ -1621,7 +1618,6 @@ out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ - __mptcp_update_wmem(sk); if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); @@ -1658,7 +1654,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); + lock_sock(sk); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1706,23 +1702,22 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) psize = min_t(size_t, psize, msg_data_left(msg)); total_ts = psize + frag_truesize; - if (!mptcp_wmem_alloc(sk, total_ts)) + if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { - mptcp_wmem_uncharge(sk, psize + frag_truesize); ret = -EFAULT; goto out; } /* data successfully copied into the write queue */ + sk->sk_forward_alloc -= total_ts; copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); - msk->tx_pending_data += psize; /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk @@ -1914,7 +1909,7 @@ static void __mptcp_update_rmem(struct sock *sk) return; atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); - sk_mem_uncharge(sk, msk->rmem_released); + mptcp_rmem_uncharge(sk, msk->rmem_released); WRITE_ONCE(msk->rmem_released, 0); } @@ -1982,7 +1977,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); - mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); + lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; @@ -2183,15 +2178,11 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) return false; } - /* will accept ack for reijected data before re-sending them */ - if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt)) - msk->recovery_snd_nxt = msk->snd_nxt; + msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; - msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq; - msk->snd_nxt = rtx_head->data_seq; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ @@ -2353,6 +2344,9 @@ static void __mptcp_retrans(struct sock *sk) int ret; mptcp_clean_una_wakeup(sk); + + /* first check ssk: need to kick "stale" logic */ + ssk = mptcp_subflow_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2365,10 +2359,12 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - return; + if (!mptcp_send_head(sk)) + return; + + goto reset_timer; } - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_timer; @@ -2395,6 +2391,8 @@ static void __mptcp_retrans(struct sock *sk) release_sock(ssk); reset_timer: + mptcp_check_and_set_pending(sk); + if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); } @@ -2459,9 +2457,8 @@ static int __mptcp_init_sock(struct sock *sk) __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - msk->wmem_reserved = 0; + msk->rmem_fwd_alloc = 0; WRITE_ONCE(msk->rmem_released, 0); - msk->tx_pending_data = 0; msk->timer_ival = TCP_RTO_MIN; msk->first = NULL; @@ -2671,7 +2668,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(msk->wmem_reserved); + WARN_ON_ONCE(msk->rmem_fwd_alloc); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -2904,8 +2901,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk) /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); - + __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); + + /* move all the rx fwd alloc into the sk_mem_reclaim_final in + * inet_sock_destruct() will dispose it + */ + sk->sk_forward_alloc += msk->rmem_fwd_alloc; + msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); } @@ -2987,10 +2990,6 @@ static void mptcp_release_cb(struct sock *sk) if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) __mptcp_error_report(sk); - /* push_pending may touch wmem_reserved, ensure we do the cleanup - * later - */ - __mptcp_update_wmem(sk); __mptcp_update_rmem(sk); } @@ -3140,6 +3139,11 @@ static void mptcp_shutdown(struct sock *sk, int how) __mptcp_wr_shutdown(sk); } +static int mptcp_forward_alloc_get(const struct sock *sk) +{ + return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -3157,6 +3161,7 @@ static struct proto mptcp_prot = { .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, + .forward_alloc_get = mptcp_forward_alloc_get, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index dc984676c5eb..67a61ac48b20 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -227,7 +227,7 @@ struct mptcp_sock { u64 ack_seq; u64 rcv_wnd_sent; u64 rcv_data_fin_seq; - int wmem_reserved; + int rmem_fwd_alloc; struct sock *last_snd; int snd_burst; int old_wspace; @@ -254,7 +254,6 @@ struct mptcp_sock { struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - int tx_pending_data; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -273,19 +272,6 @@ struct mptcp_sock { char ca_name[TCP_CA_NAME_MAX]; }; -#define mptcp_lock_sock(___sk, cb) do { \ - struct sock *__sk = (___sk); /* silence macro reuse warning */ \ - might_sleep(); \ - spin_lock_bh(&__sk->sk_lock.slock); \ - if (__sk->sk_lock.owned) \ - __lock_sock(__sk); \ - cb; \ - __sk->sk_lock.owned = 1; \ - spin_unlock(&__sk->sk_lock.slock); \ - mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ - local_bh_enable(); \ -} while (0) - #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) @@ -737,9 +723,6 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 8c03afac5ca0..0f1e661c2032 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -14,6 +14,8 @@ #include <net/mptcp.h> #include "protocol.h" +#define MIN_INFO_OPTLEN_SIZE 16 + static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); @@ -670,6 +672,266 @@ out: return ret; } +void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + u32 flags = 0; + bool slow; + u8 val; + + memset(info, 0, sizeof(*info)); + + slow = lock_sock_fast(sk); + + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); + val = mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_add_addr_signal_max = val; + val = mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_add_addr_accepted_max = val; + info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = READ_ONCE(msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); + + unlock_sock_fast(sk, slow); +} +EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); + +static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen) +{ + struct mptcp_info m_info; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(struct mptcp_info)); + + mptcp_diag_fill_info(msk, &m_info); + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &m_info, len)) + return -EFAULT; + + return 0; +} + +static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, + char __user *optval, + u32 copied, + int __user *optlen) +{ + u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd)); + + if (copied) + copied += sfd->size_subflow_data; + else + copied = copylen; + + if (put_user(copied, optlen)) + return -EFAULT; + + if (copy_to_user(optval, sfd, copylen)) + return -EFAULT; + + return 0; +} + +static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, + char __user *optval, int __user *optlen) +{ + int len, copylen; + + if (get_user(len, optlen)) + return -EFAULT; + + /* if mptcp_subflow_data size is changed, need to adjust + * this function to deal with programs using old version. + */ + BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE); + + if (len < MIN_INFO_OPTLEN_SIZE) + return -EINVAL; + + memset(sfd, 0, sizeof(*sfd)); + + copylen = min_t(unsigned int, len, sizeof(*sfd)); + if (copy_from_user(sfd, optval, copylen)) + return -EFAULT; + + /* size_subflow_data is u32, but len is signed */ + if (sfd->size_subflow_data > INT_MAX || + sfd->size_user > INT_MAX) + return -EINVAL; + + if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE || + sfd->size_subflow_data > len) + return -EINVAL; + + if (sfd->num_subflows || sfd->size_kernel) + return -EINVAL; + + return len - sfd->size_subflow_data; +} + +static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned int sfcount = 0, copied = 0; + struct mptcp_subflow_data sfd; + char __user *infoptr; + int len; + + len = mptcp_get_subflow_data(&sfd, optval, optlen); + if (len < 0) + return len; + + sfd.size_kernel = sizeof(struct tcp_info); + sfd.size_user = min_t(unsigned int, sfd.size_user, + sizeof(struct tcp_info)); + + infoptr = optval + sfd.size_subflow_data; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ++sfcount; + + if (len && len >= sfd.size_user) { + struct tcp_info info; + + tcp_get_info(ssk, &info); + + if (copy_to_user(infoptr, &info, sfd.size_user)) { + release_sock(sk); + return -EFAULT; + } + + infoptr += sfd.size_user; + copied += sfd.size_user; + len -= sfd.size_user; + } + } + + release_sock(sk); + + sfd.num_subflows = sfcount; + + if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) + return -EFAULT; + + return 0; +} + +static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a) +{ + struct inet_sock *inet = inet_sk(sk); + + memset(a, 0, sizeof(*a)); + + if (sk->sk_family == AF_INET) { + a->sin_local.sin_family = AF_INET; + a->sin_local.sin_port = inet->inet_sport; + a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr; + + if (!a->sin_local.sin_addr.s_addr) + a->sin_local.sin_addr.s_addr = inet->inet_saddr; + + a->sin_remote.sin_family = AF_INET; + a->sin_remote.sin_port = inet->inet_dport; + a->sin_remote.sin_addr.s_addr = inet->inet_daddr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_family == AF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + if (WARN_ON_ONCE(!np)) + return; + + a->sin6_local.sin6_family = AF_INET6; + a->sin6_local.sin6_port = inet->inet_sport; + + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + a->sin6_local.sin6_addr = np->saddr; + else + a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr; + + a->sin6_remote.sin6_family = AF_INET6; + a->sin6_remote.sin6_port = inet->inet_dport; + a->sin6_remote.sin6_addr = sk->sk_v6_daddr; +#endif + } +} + +static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + struct mptcp_subflow_context *subflow; + unsigned int sfcount = 0, copied = 0; + struct mptcp_subflow_data sfd; + char __user *addrptr; + int len; + + len = mptcp_get_subflow_data(&sfd, optval, optlen); + if (len < 0) + return len; + + sfd.size_kernel = sizeof(struct mptcp_subflow_addrs); + sfd.size_user = min_t(unsigned int, sfd.size_user, + sizeof(struct mptcp_subflow_addrs)); + + addrptr = optval + sfd.size_subflow_data; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ++sfcount; + + if (len && len >= sfd.size_user) { + struct mptcp_subflow_addrs a; + + mptcp_get_sub_addrs(ssk, &a); + + if (copy_to_user(addrptr, &a, sfd.size_user)) { + release_sock(sk); + return -EFAULT; + } + + addrptr += sfd.size_user; + copied += sfd.size_user; + len -= sfd.size_user; + } + } + + release_sock(sk); + + sfd.num_subflows = sfcount; + + if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) + return -EFAULT; + + return 0; +} + static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -684,6 +946,21 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + switch (optname) { + case MPTCP_INFO: + return mptcp_getsockopt_info(msk, optval, optlen); + case MPTCP_TCPINFO: + return mptcp_getsockopt_tcpinfo(msk, optval, optlen); + case MPTCP_SUBFLOW_ADDRS: + return mptcp_getsockopt_subflow_addrs(msk, optval, optlen); + } + + return -EOPNOTSUPP; +} + int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { @@ -706,6 +983,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); + if (level == SOL_MPTCP) + return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option); return -EOPNOTSUPP; } |