summaryrefslogtreecommitdiff
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 16:07:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 16:07:23 -0700
commit6e98b09da931a00bf4e0477d0fa52748bf28fcce (patch)
tree9c658ed95add5693f42f29f63df80a2ede3f6ec2 /net/mptcp/protocol.c
parentb68ee1c6131c540a62ecd443be89c406401df091 (diff)
parent9b78d919632b7149d311aaad5a977e4b48b10321 (diff)
Merge tag 'net-next-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core: - Introduce a config option to tweak MAX_SKB_FRAGS. Increasing the default value allows for better BIG TCP performances - Reduce compound page head access for zero-copy data transfers - RPS/RFS improvements, avoiding unneeded NET_RX_SOFTIRQ when possible - Threaded NAPI improvements, adding defer skb free support and unneeded softirq avoidance - Address dst_entry reference count scalability issues, via false sharing avoidance and optimize refcount tracking - Add lockless accesses annotation to sk_err[_soft] - Optimize again the skb struct layout - Extends the skb drop reasons to make it usable by multiple subsystems - Better const qualifier awareness for socket casts BPF: - Add skb and XDP typed dynptrs which allow BPF programs for more ergonomic and less brittle iteration through data and variable-sized accesses - Add a new BPF netfilter program type and minimal support to hook BPF programs to netfilter hooks such as prerouting or forward - Add more precise memory usage reporting for all BPF map types - Adds support for using {FOU,GUE} encap with an ipip device operating in collect_md mode and add a set of BPF kfuncs for controlling encap params - Allow BPF programs to detect at load time whether a particular kfunc exists or not, and also add support for this in light skeleton - Bigger batch of BPF verifier improvements to prepare for upcoming BPF open-coded iterators allowing for less restrictive looping capabilities - Rework RCU enforcement in the verifier, add kptr_rcu and enforce BPF programs to NULL-check before passing such pointers into kfunc - Add support for kptrs in percpu hashmaps, percpu LRU hashmaps and in local storage maps - Enable RCU semantics for task BPF kptrs and allow referenced kptr tasks to be stored in BPF maps - Add support for refcounted local kptrs to the verifier for allowing shared ownership, useful for adding a node to both the BPF list and rbtree - Add BPF verifier support for ST instructions in convert_ctx_access() which will help new -mcpu=v4 clang flag to start emitting them - Add ARM32 USDT support to libbpf - Improve bpftool's visual program dump which produces the control flow graph in a DOT format by adding C source inline annotations Protocols: - IPv4: Allow adding to IPv4 address a 'protocol' tag. Such value indicates the provenance of the IP address - IPv6: optimize route lookup, dropping unneeded R/W lock acquisition - Add the handshake upcall mechanism, allowing the user-space to implement generic TLS handshake on kernel's behalf - Bridge: support per-{Port, VLAN} neighbor suppression, increasing resilience to nodes failures - SCTP: add support for Fair Capacity and Weighted Fair Queueing schedulers - MPTCP: delay first subflow allocation up to its first usage. This will allow for later better LSM interaction - xfrm: Remove inner/outer modes from input/output path. These are not needed anymore - WiFi: - reduced neighbor report (RNR) handling for AP mode - HW timestamping support - support for randomized auth/deauth TA for PASN privacy - per-link debugfs for multi-link - TC offload support for mac80211 drivers - mac80211 mesh fast-xmit and fast-rx support - enable Wi-Fi 7 (EHT) mesh support Netfilter: - Add nf_tables 'brouting' support, to force a packet to be routed instead of being bridged - Update bridge netfilter and ovs conntrack helpers to handle IPv6 Jumbo packets properly, i.e. fetch the packet length from hop-by-hop extension header. This is needed for BIT TCP support - The iptables 32bit compat interface isn't compiled in by default anymore - Move ip(6)tables builtin icmp matches to the udptcp one. This has the advantage that icmp/icmpv6 match doesn't load the iptables/ip6tables modules anymore when iptables-nft is used - Extended netlink error report for netdevice in flowtables and netdev/chains. Allow for incrementally add/delete devices to netdev basechain. Allow to create netdev chain without device Driver API: - Remove redundant Device Control Error Reporting Enable, as PCI core has already error reporting enabled at enumeration time - Move Multicast DB netlink handlers to core, allowing devices other then bridge to use them - Allow the page_pool to directly recycle the pages from safely localized NAPI - Implement lockless TX queue stop/wake combo macros, allowing for further code de-duplication and sanitization - Add YNL support for user headers and struct attrs - Add partial YNL specification for devlink - Add partial YNL specification for ethtool - Add tc-mqprio and tc-taprio support for preemptible traffic classes - Add tx push buf len param to ethtool, specifies the maximum number of bytes of a transmitted packet a driver can push directly to the underlying device - Add basic LED support for switch/phy - Add NAPI documentation, stop relaying on external links - Convert dsa_master_ioctl() to netdev notifier. This is a preparatory work to make the hardware timestamping layer selectable by user space - Add transceiver support and improve the error messages for CAN-FD controllers New hardware / drivers: - Ethernet: - AMD/Pensando core device support - MediaTek MT7981 SoC - MediaTek MT7988 SoC - Broadcom BCM53134 embedded switch - Texas Instruments CPSW9G ethernet switch - Qualcomm EMAC3 DWMAC ethernet - StarFive JH7110 SoC - NXP CBTX ethernet PHY - WiFi: - Apple M1 Pro/Max devices - RealTek rtl8710bu/rtl8188gu - RealTek rtl8822bs, rtl8822cs and rtl8821cs SDIO chipset - Bluetooth: - Realtek RTL8821CS, RTL8851B, RTL8852BS - Mediatek MT7663, MT7922 - NXP w8997 - Actions Semi ATS2851 - QTI WCN6855 - Marvell 88W8997 - Can: - STMicroelectronics bxcan stm32f429 Drivers: - Ethernet NICs: - Intel (1G, icg): - add tracking and reporting of QBV config errors - add support for configuring max SDU for each Tx queue - Intel (100G, ice): - refactor mailbox overflow detection to support Scalable IOV - GNSS interface optimization - Intel (i40e): - support XDP multi-buffer - nVidia/Mellanox: - add the support for linux bridge multicast offload - enable TC offload for egress and engress MACVLAN over bond - add support for VxLAN GBP encap/decap flows offload - extend packet offload to fully support libreswan - support tunnel mode in mlx5 IPsec packet offload - extend XDP multi-buffer support - support MACsec VLAN offload - add support for dynamic msix vectors allocation - drop RX page_cache and fully use page_pool - implement thermal zone to report NIC temperature - Netronome/Corigine: - add support for multi-zone conntrack offload - Solarflare/Xilinx: - support offloading TC VLAN push/pop actions to the MAE - support TC decap rules - support unicast PTP - Other NICs: - Broadcom (bnxt): enforce software based freq adjustments only on shared PHC NIC - RealTek (r8169): refactor to addess ASPM issues during NAPI poll - Micrel (lan8841): add support for PTP_PF_PEROUT - Cadence (macb): enable PTP unicast - Engleder (tsnep): add XDP socket zero-copy support - virtio-net: implement exact header length guest feature - veth: add page_pool support for page recycling - vxlan: add MDB data path support - gve: add XDP support for GQI-QPL format - geneve: accept every ethertype - macvlan: allow some packets to bypass broadcast queue - mana: add support for jumbo frame - Ethernet high-speed switches: - Microchip (sparx5): Add support for TC flower templates - Ethernet embedded switches: - Broadcom (b54): - configure 6318 and 63268 RGMII ports - Marvell (mv88e6xxx): - faster C45 bus scan - Microchip: - lan966x: - add support for IS1 VCAP - better TX/RX from/to CPU performances - ksz9477: add ETS Qdisc support - ksz8: enhance static MAC table operations and error handling - sama7g5: add PTP capability - NXP (ocelot): - add support for external ports - add support for preemptible traffic classes - Texas Instruments: - add CPSWxG SGMII support for J7200 and J721E - Intel WiFi (iwlwifi): - preparation for Wi-Fi 7 EHT and multi-link support - EHT (Wi-Fi 7) sniffer support - hardware timestamping support for some devices/firwmares - TX beacon protection on newer hardware - Qualcomm 802.11ax WiFi (ath11k): - MU-MIMO parameters support - ack signal support for management packets - RealTek WiFi (rtw88): - SDIO bus support - better support for some SDIO devices (e.g. MAC address from efuse) - RealTek WiFi (rtw89): - HW scan support for 8852b - better support for 6 GHz scanning - support for various newer firmware APIs - framework firmware backwards compatibility - MediaTek WiFi (mt76): - P2P support - mesh A-MSDU support - EHT (Wi-Fi 7) support - coredump support" * tag 'net-next-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2078 commits) net: phy: hide the PHYLIB_LEDS knob net: phy: marvell-88x2222: remove unnecessary (void*) conversions tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp. net: amd: Fix link leak when verifying config failed net: phy: marvell: Fix inconsistent indenting in led_blink_set lan966x: Don't use xdp_frame when action is XDP_TX tsnep: Add XDP socket zero-copy TX support tsnep: Add XDP socket zero-copy RX support tsnep: Move skb receive action to separate function tsnep: Add functions for queue enable/disable tsnep: Rework TX/RX queue initialization tsnep: Replace modulo operation with mask net: phy: dp83867: Add led_brightness_set support net: phy: Fix reading LED reg property drivers: nfc: nfcsim: remove return value check of `dev_dir` net: phy: dp83867: Remove unnecessary (void*) conversions net: ethtool: coalesce: try to make user settings stick twice net: mana: Check if netdev/napi_alloc_frag returns single page net: mana: Rename mana_refill_rxoob and remove some empty lines net: veth: add page_pool stats ...
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c107
1 files changed, 69 insertions, 38 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b998e9df53ce..08dc53f56bc2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -49,18 +49,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk);
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;
-/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
- * completed yet or has failed, return the subflow socket.
- * Otherwise return NULL.
- */
-struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
-{
- if (!msk->subflow || READ_ONCE(msk->can_ack))
- return NULL;
-
- return msk->subflow;
-}
-
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
@@ -116,6 +104,31 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
return 0;
}
+/* If the MPC handshake is not started, returns the first subflow,
+ * eventually allocating it.
+ */
+struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret;
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ return ERR_PTR(-EINVAL);
+
+ if (!msk->subflow) {
+ if (msk->first)
+ return ERR_PTR(-EINVAL);
+
+ ret = __mptcp_socket_create(msk);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mptcp_sockopt_sync(msk, msk->first);
+ }
+
+ return msk->subflow;
+}
+
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
@@ -459,7 +472,7 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
return false;
}
-static void mptcp_set_datafin_timeout(const struct sock *sk)
+static void mptcp_set_datafin_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 retransmits;
@@ -1662,13 +1675,31 @@ static void mptcp_set_nospace(struct sock *sk)
static int mptcp_disconnect(struct sock *sk, int flags);
-static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg,
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
size_t len, int *copied_syn)
{
unsigned int saved_flags = msg->msg_flags;
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ struct sock *ssk;
int ret;
+ /* on flags based fastopen the mptcp is supposed to create the
+ * first subflow right now. Otherwise we are in the defer_connect
+ * path, and the first subflow must be already present.
+ * Since the defer_connect flag is cleared after the first succsful
+ * fastopen attempt, no need to check for additional subflow status.
+ */
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ ssock = __mptcp_nmpc_socket(msk);
+ if (IS_ERR(ssock))
+ return PTR_ERR(ssock);
+ }
+ if (!msk->first)
+ return -EINVAL;
+
+ ssk = msk->first;
+
lock_sock(ssk);
msg->msg_flags |= MSG_DONTWAIT;
msk->connect_flags = O_NONBLOCK;
@@ -1691,6 +1722,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh
} else if (ret && ret != -EINPROGRESS) {
mptcp_disconnect(sk, 0);
}
+ inet_sk(sk)->defer_connect = 0;
return ret;
}
@@ -1699,7 +1731,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
- struct socket *ssock;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1709,12 +1740,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect ||
- msg->msg_flags & MSG_FASTOPEN))) {
+ if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
- ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
+ ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
copied += copied_syn;
if (ret == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -2486,15 +2515,15 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
/* Mirror the tcp_reset() error propagation */
switch (sk->sk_state) {
case TCP_SYN_SENT:
- sk->sk_err = ECONNREFUSED;
+ WRITE_ONCE(sk->sk_err, ECONNREFUSED);
break;
case TCP_CLOSE_WAIT:
- sk->sk_err = EPIPE;
+ WRITE_ONCE(sk->sk_err, EPIPE);
break;
case TCP_CLOSE:
return;
default:
- sk->sk_err = ECONNRESET;
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
}
inet_sk_state_store(sk, TCP_CLOSE);
@@ -2734,10 +2763,6 @@ static int mptcp_init_sock(struct sock *sk)
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
- ret = __mptcp_socket_create(mptcp_sk(sk));
- if (ret)
- return ret;
-
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
@@ -2942,10 +2967,13 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_check_readable(msk)) {
- /* the msk has read data, do the MPTCP equivalent of TCP reset */
+ if (mptcp_check_readable(msk) || timeout < 0) {
+ /* If the msk has read data, or the caller explicitly ask it,
+ * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
+ */
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_do_fastclose(sk);
+ timeout = 0;
} else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
}
@@ -3157,7 +3185,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
struct socket *listener;
struct sock *newsk;
- listener = __mptcp_nmpc_socket(msk);
+ listener = msk->subflow;
if (WARN_ON_ONCE(!listener)) {
*err = -EINVAL;
return NULL;
@@ -3377,7 +3405,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum)
struct mptcp_sock *msk = mptcp_sk(sk);
struct socket *ssock;
- ssock = __mptcp_nmpc_socket(msk);
+ ssock = msk->subflow;
pr_debug("msk=%p, subflow=%p", msk, ssock);
if (WARN_ON_ONCE(!ssock))
return -EINVAL;
@@ -3565,8 +3593,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
int err = -EINVAL;
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
- return -EINVAL;
+ if (IS_ERR(ssock))
+ return PTR_ERR(ssock);
mptcp_token_destroy(msk);
inet_sk_state_store(sk, TCP_SYN_SENT);
@@ -3654,8 +3682,8 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
lock_sock(sock->sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
goto unlock;
}
@@ -3691,8 +3719,8 @@ static int mptcp_listen(struct socket *sock, int backlog)
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
goto unlock;
}
@@ -3723,7 +3751,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
pr_debug("msk=%p", msk);
- ssock = __mptcp_nmpc_socket(msk);
+ /* buggy applications can call accept on socket states other then LISTEN
+ * but no need to allocate the first subflow just to error out.
+ */
+ ssock = msk->subflow;
if (!ssock)
return -EINVAL;
@@ -3817,7 +3848,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
- if (sk->sk_err)
+ if (READ_ONCE(sk->sk_err))
mask |= EPOLLERR;
return mask;