Merge branch 'net-group-together-hot-data'

Eric Dumazet says: ==================== net: group together hot data While our recent structure reorganizations were focused on increasing max throughput, there is still an area where improvements are much needed. In many cases, a cpu handles one packet at a time, instead of a nice batch. Hardware interrupt. -> Software interrupt. -> Network/Protocol stacks. If the cpu was idle or busy in other layers, it has to pull many cache lines. This series adds a new net_hotdata structure, where some critical (and read-mostly) data used in rx and tx path is packed in a small number of cache lines. Synthetic benchmarks will not see much difference, but latency of single packet should improve. net_hodata current size on 64bit is 416 bytes, but might grow in the future. Also move RPS definitions to a new include file. ==================== Link: https://lore.kernel.org/r/20240306160031.874438-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
author: Jakub Kicinski <kuba@kernel.org> 2024-03-07 21:12:45 -0800
committer: Jakub Kicinski <kuba@kernel.org> 2024-03-07 21:12:46 -0800
commit: e8bb2ccff7216d520a7bc33c22484dafebe8147e (patch)
tree: 29b291f5ccbeb9324f98f48e352a84d311314f69 /include/net
parent: d3423ed9b89d27d0fd8386dbb9bee89728c92b87 (diff)
parent: ce7f49ab741591d83e33e56948bac2f12de6e14e (diff)
5 files changed, 182 insertions, 38 deletions
diff --git a/include/net/gro.h b/include/net/gro.h
index 2b58671a6549..d6fc8fbd3730 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -9,6 +9,7 @@
 #include <net/ip6_checksum.h>
 #include <linux/skbuff.h>
 #include <net/udp.h>
+#include <net/hotdata.h>
 
 struct napi_gro_cb {
 	union {
@@ -446,7 +447,7 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb,
 {
 	list_add_tail(&skb->list, &napi->rx_list);
 	napi->rx_count += segs;
-	if (napi->rx_count >= READ_ONCE(gro_normal_batch))
+	if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch))
 		gro_normal_list(napi);
 }
 
@@ -493,6 +494,4 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *
 #endif
 }
 
-extern struct list_head offload_base;
-
 #endif /* _NET_IPV6_GRO_H */
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
new file mode 100644
index 000000000000..003667a1efd6
--- /dev/null
+++ b/include/net/hotdata.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_HOTDATA_H
+#define _NET_HOTDATA_H
+
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <net/protocol.h>
+
+/* Read mostly data used in network fast paths. */
+struct net_hotdata {
+#if IS_ENABLED(CONFIG_INET)
+	struct packet_offload	ip_packet_offload;
+	struct net_offload	tcpv4_offload;
+	struct net_protocol	tcp_protocol;
+	struct net_offload 	udpv4_offload;
+	struct net_protocol	udp_protocol;
+	struct packet_offload	ipv6_packet_offload;
+	struct net_offload	tcpv6_offload;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_protocol	tcpv6_protocol;
+	struct inet6_protocol	udpv6_protocol;
+#endif
+	struct net_offload	udpv6_offload;
+#endif
+	struct list_head	offload_base;
+	struct list_head	ptype_all;
+	struct kmem_cache	*skbuff_cache;
+	struct kmem_cache	*skbuff_fclone_cache;
+	struct kmem_cache	*skb_small_head_cache;
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+	u32			rps_cpu_mask;
+#endif
+	int			gro_normal_batch;
+	int			netdev_budget;
+	int			netdev_budget_usecs;
+	int			tstamp_prequeue;
+	int			max_backlog;
+	int			dev_tx_weight;
+	int			dev_rx_weight;
+};
+
+#define inet_ehash_secret	net_hotdata.tcp_protocol.secret
+#define udp_ehash_secret	net_hotdata.udp_protocol.secret
+#define inet6_ehash_secret	net_hotdata.tcpv6_protocol.secret
+#define tcp_ipv6_hash_secret	net_hotdata.tcpv6_offload.secret
+#define udp6_ehash_secret	net_hotdata.udpv6_protocol.secret
+#define udp_ipv6_hash_secret	net_hotdata.udpv6_offload.secret
+
+extern struct net_hotdata net_hotdata;
+
+#endif /* _NET_HOTDATA_H */
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 6aef8cb11cc8..b2499f88f8f8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -46,6 +46,7 @@ struct net_protocol {
 				 * socket lookup?
 				 */
 				icmp_strict_tag_validation:1;
+	u32			secret;
 };
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -59,6 +60,7 @@ struct inet6_protocol {
 			       __be32 info);
 
 	unsigned int	flags;	/* INET6_PROTO_xxx */
+	u32		secret;
 };
 
 #define INET6_PROTO_NOPOLICY	0x1
@@ -68,6 +70,7 @@ struct inet6_protocol {
 struct net_offload {
 	struct offload_callbacks callbacks;
 	unsigned int		 flags;	/* Flags used by IPv6 for now */
+	u32			 secret;
 };
 /* This should be set for any extension header which is compatible with GSO. */
 #define INET6_PROTO_GSO_EXTHDR	0x1
diff --git a/include/net/rps.h b/include/net/rps.h
new file mode 100644
index 000000000000..7660243e905b
--- /dev/null
+++ b/include/net/rps.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_RPS_H
+#define _NET_RPS_H
+
+#include <linux/types.h>
+#include <linux/static_key.h>
+#include <net/sock.h>
+#include <net/hotdata.h>
+
+#ifdef CONFIG_RPS
+
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
+
+/*
+ * This structure holds an RPS map which can be of variable length.  The
+ * map is an array of CPUs.
+ */
+struct rps_map {
+	unsigned int	len;
+	struct rcu_head	rcu;
+	u16		cpus[];
+};
+#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
+
+/*
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
+ * tail pointer for that CPU's input queue at the time of last enqueue, and
+ * a hardware filter index.
+ */
+struct rps_dev_flow {
+	u16		cpu;
+	u16		filter;
+	unsigned int	last_qtail;
+};
+#define RPS_NO_FILTER 0xffff
+
+/*
+ * The rps_dev_flow_table structure contains a table of flow mappings.
+ */
+struct rps_dev_flow_table {
+	unsigned int		mask;
+	struct rcu_head		rcu;
+	struct rps_dev_flow	flows[];
+};
+#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
+    ((_num) * sizeof(struct rps_dev_flow)))
+
+/*
+ * The rps_sock_flow_table contains mappings of flows to the last CPU
+ * on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high-order bits
+ * of flow hash, lower part is CPU number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
+ */
+struct rps_sock_flow_table {
+	u32	mask;
+
+	u32	ents[] ____cacheline_aligned_in_smp;
+};
+#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
+
+#define RPS_NO_CPU 0xffff
+
+static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
+					u32 hash)
+{
+	unsigned int index = hash & table->mask;
+	u32 val = hash & ~net_hotdata.rps_cpu_mask;
+
+	/* We only give a hint, preemption can change CPU under us */
+	val |= raw_smp_processor_id();
+
+	/* The following WRITE_ONCE() is paired with the READ_ONCE()
+	 * here, and another one in get_rps_cpu().
+	 */
+	if (READ_ONCE(table->ents[index]) != val)
+		WRITE_ONCE(table->ents[index], val);
+}
+
+#endif /* CONFIG_RPS */
+
+static inline void sock_rps_record_flow_hash(__u32 hash)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	if (!hash)
+		return;
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
+	if (sock_flow_table)
+		rps_record_sock_flow(sock_flow_table, hash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void sock_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	if (static_branch_unlikely(&rfs_needed)) {
+		/* Reading sk->sk_rxhash might incur an expensive cache line
+		 * miss.
+		 *
+		 * TCP_ESTABLISHED does cover almost all states where RFS
+		 * might be useful, and is cheaper [1] than testing :
+		 *	IPv4: inet_sk(sk)->inet_daddr
+		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+		 * OR	an additional socket flag
+		 * [1] : sk_state and sk_prot are in the same cache line.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED) {
+			/* This READ_ONCE() is paired with the WRITE_ONCE()
+			 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
+			 */
+			sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
+		}
+	}
+#endif
+}
+
+#endif /* _NET_RPS_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 09a0cde8bf52..b5e00702acc1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1117,41 +1117,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk)
 		WRITE_ONCE(sk->sk_incoming_cpu, cpu);
 }
 
-static inline void sock_rps_record_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_record_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
-static inline void sock_rps_record_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	if (static_branch_unlikely(&rfs_needed)) {
-		/* Reading sk->sk_rxhash might incur an expensive cache line
-		 * miss.
-		 *
-		 * TCP_ESTABLISHED does cover almost all states where RFS
-		 * might be useful, and is cheaper [1] than testing :
-		 *	IPv4: inet_sk(sk)->inet_daddr
-		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-		 * OR	an additional socket flag
-		 * [1] : sk_state and sk_prot are in the same cache line.
-		 */
-		if (sk->sk_state == TCP_ESTABLISHED) {
-			/* This READ_ONCE() is paired with the WRITE_ONCE()
-			 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
-			 */
-			sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
-		}
-	}
-#endif
-}
 
 static inline void sock_rps_save_rxhash(struct sock *sk,
 					const struct sk_buff *skb)
author	Jakub Kicinski <kuba@kernel.org>	2024-03-07 21:12:45 -0800
committer	Jakub Kicinski <kuba@kernel.org>	2024-03-07 21:12:46 -0800
commit	e8bb2ccff7216d520a7bc33c22484dafebe8147e (patch)
tree	29b291f5ccbeb9324f98f48e352a84d311314f69 /include/net
parent	d3423ed9b89d27d0fd8386dbb9bee89728c92b87 (diff)
parent	ce7f49ab741591d83e33e56948bac2f12de6e14e (diff)