summaryrefslogtreecommitdiff
path: root/net/ipv4/af_inet.c
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2022-08-24 19:30:19 -0700
committerJakub Kicinski <kuba@kernel.org>2022-08-24 19:30:19 -0700
commitc21e1bf4d8104fb77bbd99f3d1276c0d7c9b28d9 (patch)
tree73ff51aa3bc284c611e346b14f4ebe8b90f251b8 /net/ipv4/af_inet.c
parent0bf73255d3a3cf3b0416e95f2c9f7c53095c2e1a (diff)
parent1be9ac87a75a4fc0e2cc254e412d2d67a58a7191 (diff)
Merge branch 'add-a-second-bind-table-hashed-by-port-and-address'
Joanne Koong says: ==================== Add a second bind table hashed by port and address Currently, there is one bind hashtable (bhash) that hashes by port only. This patchset adds a second bind table (bhash2) that hashes by port and address. The motivation for adding bhash2 is to expedite bind requests in situations where the port has many sockets in its bhash table entry (eg a large number of sockets bound to different addresses on the same port), which makes checking bind conflicts costly especially given that we acquire the table entry spinlock while doing so, which can cause softirq cpu lockups and can prevent new tcp connections. We ran into this problem at Meta where the traffic team binds a large number of IPs to port 443 and the bind() call took a significant amount of time which led to cpu softirq lockups, which caused packet drops and other failures on the machine. When experimentally testing this on a local server for ~24k sockets bound to the port, the results seen were: ipv4: before - 0.002317 seconds with bhash2 - 0.000020 seconds ipv6: before - 0.002431 seconds with bhash2 - 0.000021 seconds The additions to the initial bhash2 submission [0] are: * Updating bhash2 in the cases where a socket's rcv saddr changes after it has * been bound * Adding locks for bhash2 hashbuckets [0] https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/ v3: https://lore.kernel.org/netdev/20220722195406.1304948-2-joannelkoong@gmail.com/ v2: https://lore.kernel.org/netdev/20220712235310.1935121-1-joannelkoong@gmail.com/ v1: https://lore.kernel.org/netdev/20220623234242.2083895-2-joannelkoong@gmail.com/ ==================== Link: https://lore.kernel.org/r/20220822181023.3979645-1-joannelkoong@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv4/af_inet.c')
-rw-r--r--net/ipv4/af_inet.c26
1 files changed, 21 insertions, 5 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3ca0cc467886..2e6a3cb06815 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1219,6 +1219,7 @@ EXPORT_SYMBOL(inet_unregister_protosw);
static int inet_sk_reselect_saddr(struct sock *sk)
{
+ struct inet_bind_hashbucket *prev_addr_hashbucket;
struct inet_sock *inet = inet_sk(sk);
__be32 old_saddr = inet->inet_saddr;
__be32 daddr = inet->inet_daddr;
@@ -1226,6 +1227,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct rtable *rt;
__be32 new_saddr;
struct ip_options_rcu *inet_opt;
+ int err;
inet_opt = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
@@ -1240,20 +1242,34 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (IS_ERR(rt))
return PTR_ERR(rt);
- sk_setup_caps(sk, &rt->dst);
-
new_saddr = fl4->saddr;
- if (new_saddr == old_saddr)
+ if (new_saddr == old_saddr) {
+ sk_setup_caps(sk, &rt->dst);
return 0;
+ }
+
+ prev_addr_hashbucket =
+ inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk,
+ sock_net(sk), inet->inet_num);
+
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
+
+ err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
+ if (err) {
+ inet->inet_saddr = old_saddr;
+ inet->inet_rcv_saddr = old_saddr;
+ ip_rt_put(rt);
+ return err;
+ }
+
+ sk_setup_caps(sk, &rt->dst);
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
- inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
-
/*
* XXX The only one ugly spot where we need to
* XXX really change the sockets identity after