summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_ipv4.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
commit8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf (patch)
treefec3039a08284cd87f4ec9c3bea5b5a439f1859f /net/ipv4/tcp_ipv4.c
parent4b290aae788e06561754b28c6842e4080957d3f7 (diff)
parentfa582ca7e187a15e772e6a72fe035f649b387a60 (diff)
Merge tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Wrap datapath globals into net_aligned_data, to avoid false sharing - Preserve MSG_ZEROCOPY in forwarding (e.g. out of a container) - Add SO_INQ and SCM_INQ support to AF_UNIX - Add SIOCINQ support to AF_VSOCK - Add TCP_MAXSEG sockopt to MPTCP - Add IPv6 force_forwarding sysctl to enable forwarding per interface - Make TCP validation of whether packet fully fits in the receive window and the rcv_buf more strict. With increased use of HW aggregation a single "packet" can be multiple 100s of kB - Add MSG_MORE flag to optimize large TCP transmissions via sockmap, improves latency up to 33% for sockmap users - Convert TCP send queue handling from tasklet to BH workque - Improve BPF iteration over TCP sockets to see each socket exactly once - Remove obsolete and unused TCP RFC3517/RFC6675 loss recovery code - Support enabling kernel threads for NAPI processing on per-NAPI instance basis rather than a whole device. Fully stop the kernel NAPI thread when threaded NAPI gets disabled. Previously thread would stick around until ifdown due to tricky synchronization - Allow multicast routing to take effect on locally-generated packets - Add output interface argument for End.X in segment routing - MCTP: add support for gateway routing, improve bind() handling - Don't require rtnl_lock when fetching an IPv6 neighbor over Netlink - Add a new neighbor flag ("extern_valid"), which cedes refresh responsibilities to userspace. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed - Support NUD_PERMANENT for proxy neighbor entries - Add a new queuing discipline for IETF RFC9332 DualQ Coupled AQM - Add sequence numbers to netconsole messages. Unregister netconsole's console when all net targets are removed. Code refactoring. Add a number of selftests - Align IPSec inbound SA lookup to RFC 4301. Only SPI and protocol should be used for an inbound SA lookup - Support inspecting ref_tracker state via DebugFS - Don't force bonding advertisement frames tx to ~333 ms boundaries. Add broadcast_neighbor option to send ARP/ND on all bonded links - Allow providing upcall pid for the 'execute' command in openvswitch - Remove DCCP support from Netfilter's conntrack - Disallow multiple packet duplications in the queuing layer - Prevent use of deprecated iptables code on PREEMPT_RT Driver API: - Support RSS and hashing configuration over ethtool Netlink - Add dedicated ethtool callbacks for getting and setting hashing fields - Add support for power budget evaluation strategy in PSE / Power-over-Ethernet. Generate Netlink events for overcurrent etc - Support DPLL phase offset monitoring across all device inputs. Support providing clock reference and SYNC over separate DPLL inputs - Support traffic classes in devlink rate API for bandwidth management - Remove rtnl_lock dependency from UDP tunnel port configuration Device drivers: - Add a new Broadcom driver for 800G Ethernet (bnge) - Add a standalone driver for Microchip ZL3073x DPLL - Remove IBM's NETIUCV device driver - Ethernet high-speed NICs: - Broadcom (bnxt): - support zero-copy Tx of DMABUF memory - take page size into account for page pool recycling rings - Intel (100G, ice, idpf): - idpf: XDP and AF_XDP support preparations - idpf: add flow steering - add link_down_events statistic - clean up the TSPLL code - preparations for live VM migration - nVidia/Mellanox: - support zero-copy Rx/Tx interfaces (DMABUF and io_uring) - optimize context memory usage for matchers - expose serial numbers in devlink info - support PCIe congestion metrics - Meta (fbnic): - add 25G, 50G, and 100G link modes to phylink - support dumping FW logs - Marvell/Cavium: - support for CN20K generation of the Octeon chips - Amazon: - add HW clock (without timestamping, just hypervisor time access) - Ethernet virtual: - VirtIO net: - support segmentation of UDP-tunnel-encapsulated packets - Google (gve): - support packet timestamping and clock synchronization - Microsoft vNIC: - add handler for device-originated servicing events - allow dynamic MSI-X vector allocation - support Tx bandwidth clamping - Ethernet NICs consumer, and embedded: - AMD: - amd-xgbe: hardware timestamping and PTP clock support - Broadcom integrated MACs (bcmgenet, bcmasp): - use napi_complete_done() return value to support NAPI polling - add support for re-starting auto-negotiation - Broadcom switches (b53): - support BCM5325 switches - add bcm63xx EPHY power control - Synopsys (stmmac): - lots of code refactoring and cleanups - TI: - icssg-prueth: read firmware-names from device tree - icssg: PRP offload support - Microchip: - lan78xx: convert to PHYLINK for improved PHY and MAC management - ksz: add KSZ8463 switch support - Intel: - support similar queue priority scheme in multi-queue and time-sensitive networking (taprio) - support packet pre-emption in both - RealTek (r8169): - enable EEE at 5Gbps on RTL8126 - Airoha: - add PPPoE offload support - MDIO bus controller for Airoha AN7583 - Ethernet PHYs: - support for the IPQ5018 internal GE PHY - micrel KSZ9477 switch-integrated PHYs: - add MDI/MDI-X control support - add RX error counters - add cable test support - add Signal Quality Indicator (SQI) reporting - dp83tg720: improve reset handling and reduce link recovery time - support bcm54811 (and its MII-Lite interface type) - air_en8811h: support resume/suspend - support PHY counters for QCA807x and QCA808x - support WoL for QCA807x - CAN drivers: - rcar_canfd: support for Transceiver Delay Compensation - kvaser: report FW versions via devlink dev info - WiFi: - extended regulatory info support (6 GHz) - add statistics and beacon monitor for Multi-Link Operation (MLO) - support S1G aggregation, improve S1G support - add Radio Measurement action fields - support per-radio RTS threshold - some work around how FIPS affects wifi, which was wrong (RC4 is used by TKIP, not only WEP) - improvements for unsolicited probe response handling - WiFi drivers: - RealTek (rtw88): - IBSS mode for SDIO devices - RealTek (rtw89): - BT coexistence for MLO/WiFi7 - concurrent station + P2P support - support for USB devices RTL8851BU/RTL8852BU - Intel (iwlwifi): - use embedded PNVM in (to be released) FW images to fix compatibility issues - many cleanups (unused FW APIs, PCIe code, WoWLAN) - some FIPS interoperability - MediaTek (mt76): - firmware recovery improvements - more MLO work - Qualcomm/Atheros (ath12k): - fix scan on multi-radio devices - more EHT/Wi-Fi 7 features - encapsulation/decapsulation offload - Broadcom (brcm80211): - support SDIO 43751 device - Bluetooth: - hci_event: add support for handling LE BIG Sync Lost event - ISO: add socket option to report packet seqnum via CMSG - ISO: support SCM_TIMESTAMPING for ISO TS - Bluetooth drivers: - intel_pcie: support Function Level Reset - nxpuart: add support for 4M baudrate - nxpuart: implement powerup sequence, reset, FW dump, and FW loading" * tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1742 commits) dpll: zl3073x: Fix build failure selftests: bpf: fix legacy netfilter options ipv6: annotate data-races around rt->fib6_nsiblings ipv6: fix possible infinite loop in fib6_info_uses_dev() ipv6: prevent infinite loop in rt6_nlmsg_size() ipv6: add a retry logic in net6_rt_notify() vrf: Drop existing dst reference in vrf_ip6_input_dst net/sched: taprio: align entry index attr validation with mqprio net: fsl_pq_mdio: use dev_err_probe selftests: rtnetlink.sh: remove esp4_offload after test vsock: remove unnecessary null check in vsock_getname() igb: xsk: solve negative overflow of nb_pkts in zerocopy mode stmmac: xsk: fix negative overflow of budget in zerocopy mode dt-bindings: ieee802154: Convert at86rf230.txt yaml format net: dsa: microchip: Disable PTP function of KSZ8463 net: dsa: microchip: Setup fiber ports for KSZ8463 net: dsa: microchip: Write switch MAC address differently for KSZ8463 net: dsa: microchip: Use different registers for KSZ8463 net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver dt-bindings: net: dsa: microchip: Add KSZ8463 switch support ...
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--net/ipv4/tcp_ipv4.c309
1 files changed, 222 insertions, 87 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6a14f9e6fef6..84d3d556ed80 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -58,7 +58,9 @@
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sock_diag.h>
+#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
@@ -787,7 +789,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
+ net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
/* Invalid TCP option size or twice included auth */
if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
@@ -1703,7 +1705,6 @@ static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
@@ -2025,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
u32 gso_size;
u64 limit;
int delta;
+ int err;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -2135,21 +2137,27 @@ no_coalesce:
limit = min_t(u64, limit, UINT_MAX);
- if (unlikely(sk_add_backlog(sk, skb, limit))) {
+ err = sk_add_backlog(sk, skb, limit);
+ if (unlikely(err)) {
bh_unlock_sock(sk);
- *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ if (err == -ENOMEM) {
+ *reason = SKB_DROP_REASON_PFMEMALLOC;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ } else {
+ *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ }
return true;
}
return false;
}
EXPORT_IPV6_MOD(tcp_add_backlog);
-int tcp_filter(struct sock *sk, struct sk_buff *skb)
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
{
struct tcphdr *th = (struct tcphdr *)skb->data;
- return sk_filter_trim_cap(sk, skb, th->doff * 4);
+ return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
}
EXPORT_IPV6_MOD(tcp_filter);
@@ -2276,14 +2284,12 @@ lookup:
}
refcounted = true;
nsk = NULL;
- if (!tcp_filter(sk, skb)) {
+ if (!tcp_filter(sk, skb, &drop_reason)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
&drop_reason);
- } else {
- drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
}
if (!nsk) {
reqsk_put(req);
@@ -2339,10 +2345,9 @@ process:
nf_reset_ct(skb);
- if (tcp_filter(sk, skb)) {
- drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ if (tcp_filter(sk, skb, &drop_reason))
goto discard_and_relse;
- }
+
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
@@ -2896,7 +2901,7 @@ static void get_openreq4(const struct request_sock *req,
jiffies_delta_to_clock_t(delta),
req->num_timeout,
from_kuid_munged(seq_user_ns(f),
- sock_i_uid(req->rsk_listener)),
+ sk_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2954,7 +2959,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
icsk->icsk_probes_out,
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
@@ -3014,13 +3019,17 @@ out:
}
#ifdef CONFIG_BPF_SYSCALL
+union bpf_tcp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
struct bpf_tcp_iter_state {
struct tcp_iter_state state;
unsigned int cur_sk;
unsigned int end_sk;
unsigned int max_sk;
- struct sock **batch;
- bool st_bucket_done;
+ union bpf_tcp_iter_batch_item *batch;
};
struct bpf_iter__tcp {
@@ -3043,21 +3052,32 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
- while (iter->cur_sk < iter->end_sk)
- sock_gen_put(iter->batch[iter->cur_sk++]);
+ union bpf_tcp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_gen_put(item->sk);
+ item->cookie = cookie;
+ }
}
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
- unsigned int new_batch_sz)
+ unsigned int new_batch_sz, gfp_t flags)
{
- struct sock **new_batch;
+ union bpf_tcp_iter_batch_item *new_batch;
new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
- GFP_USER | __GFP_NOWARN);
+ flags | __GFP_NOWARN);
if (!new_batch)
return -ENOMEM;
- bpf_iter_tcp_put_batch(iter);
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
@@ -3065,112 +3085,234 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
return 0;
}
-static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
- struct sock *start_sk)
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+ union bpf_tcp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ sk_nulls_for_each_from(sk, node)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ return sk;
+ }
+
+ return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = listening_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ ++st->bucket;
+ sk = listening_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = established_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+ ++st->bucket;
+ sk = established_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ sk = bpf_iter_tcp_resume_listening(seq);
+ if (sk)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ fallthrough;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ sk = bpf_iter_tcp_resume_established(seq);
+ break;
+ }
+
+ return sk;
+}
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
struct hlist_nulls_node *node;
unsigned int expected = 1;
struct sock *sk;
- sock_hold(start_sk);
- iter->batch[iter->end_sk++] = start_sk;
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
- sk = sk_nulls_next(start_sk);
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk)) {
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
}
expected++;
}
}
- spin_unlock(&hinfo->lhash2[st->bucket].lock);
return expected;
}
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
- struct sock *start_sk)
+ struct sock **start_sk)
{
- struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
- struct tcp_iter_state *st = &iter->state;
struct hlist_nulls_node *node;
unsigned int expected = 1;
struct sock *sk;
- sock_hold(start_sk);
- iter->batch[iter->end_sk++] = start_sk;
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
- sk = sk_nulls_next(start_sk);
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk)) {
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
}
expected++;
}
}
- spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
return expected;
}
-static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ return bpf_iter_tcp_listening_batch(seq, start_sk);
+ else
+ return bpf_iter_tcp_established_batch(seq, start_sk);
+}
+
+static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
{
struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ else
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
unsigned int expected;
- bool resized = false;
struct sock *sk;
+ int err;
- /* The st->bucket is done. Directly advance to the next
- * bucket instead of having the tcp_seek_last_pos() to skip
- * one by one in the current bucket and eventually find out
- * it has to advance to the next bucket.
- */
- if (iter->st_bucket_done) {
- st->offset = 0;
- st->bucket++;
- if (st->state == TCP_SEQ_STATE_LISTENING &&
- st->bucket > hinfo->lhash2_mask) {
- st->state = TCP_SEQ_STATE_ESTABLISHED;
- st->bucket = 0;
- }
- }
+ sk = bpf_iter_tcp_resume(seq);
+ if (!sk)
+ return NULL; /* Done */
-again:
- /* Get a new batch */
- iter->cur_sk = 0;
- iter->end_sk = 0;
- iter->st_bucket_done = false;
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
- sk = tcp_seek_last_pos(seq);
+ /* Batch size was too small. */
+ bpf_iter_tcp_unlock_bucket(seq);
+ bpf_iter_tcp_put_batch(iter);
+ err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+
+ sk = bpf_iter_tcp_resume(seq);
if (!sk)
return NULL; /* Done */
- if (st->state == TCP_SEQ_STATE_LISTENING)
- expected = bpf_iter_tcp_listening_batch(seq, sk);
- else
- expected = bpf_iter_tcp_established_batch(seq, sk);
-
- if (iter->end_sk == expected) {
- iter->st_bucket_done = true;
- return sk;
- }
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
- if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
- resized = true;
- goto again;
+ /* Batch size was still too small. Hold onto the lock while we try
+ * again with a larger batch to make sure the current bucket's size
+ * does not change in the meantime.
+ */
+ err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
+ if (err) {
+ bpf_iter_tcp_unlock_bucket(seq);
+ return ERR_PTR(err);
}
- return sk;
+ expected = bpf_iter_fill_batch(seq, &sk);
+ WARN_ON_ONCE(iter->end_sk != expected);
+done:
+ bpf_iter_tcp_unlock_bucket(seq);
+ return iter->batch[0].sk;
}
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
@@ -3200,16 +3342,11 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
* meta.seq_num is used instead.
*/
st->num++;
- /* Move st->offset to the next sk in the bucket such that
- * the future start() will resume at st->offset in
- * st->bucket. See tcp_seek_last_pos().
- */
- st->offset++;
- sock_gen_put(iter->batch[iter->cur_sk++]);
+ sock_gen_put(iter->batch[iter->cur_sk++].sk);
}
if (iter->cur_sk < iter->end_sk)
- sk = iter->batch[iter->cur_sk];
+ sk = iter->batch[iter->cur_sk].sk;
else
sk = bpf_iter_tcp_batch(seq);
@@ -3246,9 +3383,9 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
const struct request_sock *req = v;
uid = from_kuid_munged(seq_user_ns(seq),
- sock_i_uid(req->rsk_listener));
+ sk_uid(req->rsk_listener));
} else {
- uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
}
meta.seq = seq;
@@ -3275,10 +3412,8 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
(void)tcp_prog_seq_show(prog, &meta, v, 0);
}
- if (iter->cur_sk < iter->end_sk) {
+ if (iter->cur_sk < iter->end_sk)
bpf_iter_tcp_put_batch(iter);
- iter->st_bucket_done = false;
- }
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
@@ -3391,7 +3526,7 @@ struct proto tcp_prot = {
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
- .memory_allocated = &tcp_memory_allocated,
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
@@ -3596,7 +3731,7 @@ static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
if (err)
return err;
- err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
+ err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
if (err) {
bpf_iter_fini_seq_net(priv_data);
return err;