summaryrefslogtreecommitdiff
path: root/net/ipv4/udp.c
diff options
context:
space:
mode:
authorKuniyuki Iwashima <kuniyu@amazon.com>2022-11-14 13:57:57 -0800
committerDavid S. Miller <davem@davemloft.net>2022-11-16 09:43:35 +0000
commit9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 (patch)
tree618c3aae6090f4aec32aaf69683f4660fc9e35f5 /net/ipv4/udp.c
parentba6aac1516779dd0ced22c136a2c2c4a9c70cf29 (diff)
udp: Introduce optional per-netns hash table.
The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP, and fewer sockets can cause a performance drop. On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes regression for the iperf3's connection. uhash_entries sockets length Gbps 64K 1 1 5.69 1Mi 16 5.27 2Mi 32 4.90 4Mi 64 4.09 8Mi 128 2.96 16Mi 256 2.06 32Mi 512 1.12 The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns and we cannot make full use of the whole buckets larger than 64K. /* 0 < num < 64K -> X < hash < X + 64K */ (num + net_hash_mix(net)) & mask; Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port(). To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same with TCP: $ dmesg | cut -d ' ' -f 6- | grep "UDP hash" UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc) # sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 65536 # can be changed by uhash_entries # sysctl net.ipv4.udp_child_hash_entries net.ipv4.udp_child_hash_entries = 0 # disabled by default # ip netns add test1 # ip netns exec test1 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = -65536 # share the global table # sysctl -w net.ipv4.udp_child_hash_entries=100 net.ipv4.udp_child_hash_entries = 100 # ip netns add test2 # ip netns exec test2 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 128 # own a per-netns table with 2^n buckets We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r--net/ipv4/udp.c101
1 files changed, 95 insertions, 6 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 37e79158d145..1fb7d1ed1cb1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -129,7 +129,7 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
#define MAX_UDP_PORTS 65536
-#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
static struct udp_table *udp_get_table_prot(struct sock *sk)
{
@@ -3277,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
&table->log,
&table->mask,
UDP_HTABLE_SIZE_MIN,
- 64 * 1024);
+ UDP_HTABLE_SIZE_MAX);
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
@@ -3302,22 +3302,111 @@ u32 udp_flow_hashrnd(void)
}
EXPORT_SYMBOL(udp_flow_hashrnd);
-static int __net_init udp_sysctl_init(struct net *net)
+static void __net_init udp_sysctl_init(struct net *net)
{
- net->ipv4.udp_table = &udp_table;
-
net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
#ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
+}
+
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{
+ struct udp_table *udptable;
+ int i;
+
+ udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+ if (!udptable)
+ goto out;
+
+ udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
+ GFP_KERNEL_ACCOUNT);
+ if (!udptable->hash)
+ goto free_table;
+
+ udptable->hash2 = udptable->hash + hash_entries;
+ udptable->mask = hash_entries - 1;
+ udptable->log = ilog2(hash_entries);
+
+ for (i = 0; i < hash_entries; i++) {
+ INIT_HLIST_HEAD(&udptable->hash[i].head);
+ udptable->hash[i].count = 0;
+ spin_lock_init(&udptable->hash[i].lock);
+
+ INIT_HLIST_HEAD(&udptable->hash2[i].head);
+ udptable->hash2[i].count = 0;
+ spin_lock_init(&udptable->hash2[i].lock);
+ }
+
+ return udptable;
+
+free_table:
+ kfree(udptable);
+out:
+ return NULL;
+}
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+ struct udp_table *udptable = net->ipv4.udp_table;
+
+ if (udptable == &udp_table)
+ return;
+
+ kvfree(udptable->hash);
+ kfree(udptable);
+}
+
+static void __net_init udp_set_table(struct net *net)
+{
+ struct udp_table *udptable;
+ unsigned int hash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+ if (!hash_entries)
+ goto fallback;
+
+ /* Set min to keep the bitmap on stack in udp_lib_get_port() */
+ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
+ hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
+ else
+ hash_entries = roundup_pow_of_two(hash_entries);
+
+ udptable = udp_pernet_table_alloc(hash_entries);
+ if (udptable) {
+ net->ipv4.udp_table = udptable;
+ } else {
+ pr_warn("Failed to allocate UDP hash table (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ hash_entries);
+fallback:
+ net->ipv4.udp_table = &udp_table;
+ }
+}
+
+static int __net_init udp_pernet_init(struct net *net)
+{
+ udp_sysctl_init(net);
+ udp_set_table(net);
return 0;
}
+static void __net_exit udp_pernet_exit(struct net *net)
+{
+ udp_pernet_table_free(net);
+}
+
static struct pernet_operations __net_initdata udp_sysctl_ops = {
- .init = udp_sysctl_init,
+ .init = udp_pernet_init,
+ .exit = udp_pernet_exit,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)