summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.rst29
-rw-r--r--drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c5
-rw-r--r--drivers/net/ethernet/netronome/nfp/crypto/tls.c5
-rw-r--r--include/net/inet_hashtables.h16
-rw-r--r--include/net/netns/ipv4.h4
-rw-r--r--include/net/tcp.h1
-rw-r--r--net/core/filter.c5
-rw-r--r--net/dccp/proto.c2
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/esp4.c3
-rw-r--r--net/ipv4/inet_connection_sock.c22
-rw-r--r--net/ipv4/inet_hashtables.c92
-rw-r--r--net/ipv4/inet_timewait_sock.c4
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c16
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c47
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_diag.c18
-rw-r--r--net/ipv4/tcp_ipv4.c143
-rw-r--r--net/ipv4/tcp_minisocks.c28
-rw-r--r--net/ipv6/esp6.c3
-rw-r--r--net/ipv6/inet6_hashtables.c4
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c8
-rw-r--r--net/ipv6/tcp_ipv6.c42
-rw-r--r--net/mptcp/mptcp_diag.c7
28 files changed, 361 insertions, 161 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a759872a2883..e7b3fa7bb3f7 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1040,6 +1040,35 @@ tcp_challenge_ack_limit - INTEGER
TCP stack implements per TCP socket limits anyway.
Default: INT_MAX (unlimited)
+tcp_ehash_entries - INTEGER
+ Show the number of hash buckets for TCP sockets in the current
+ networking namespace.
+
+ A negative value means the networking namespace does not own its
+ hash buckets and shares the initial networking namespace's one.
+
+tcp_child_ehash_entries - INTEGER
+ Control the number of hash buckets for TCP sockets in the child
+ networking namespace, which must be set before clone() or unshare().
+
+ If the value is not 0, the kernel uses a value rounded up to 2^n
+ as the actual hash bucket size. 0 is a special value, meaning
+ the child networking namespace will share the initial networking
+ namespace's hash buckets.
+
+ Note that the child will use the global one in case the kernel
+ fails to allocate enough memory. In addition, the global hash
+ buckets are spread over available NUMA nodes, but the allocation
+ of the child hash table depends on the current process's NUMA
+ policy, which could result in performance differences.
+
+ Note also that the default value of tcp_max_tw_buckets and
+ tcp_max_syn_backlog depend on the hash bucket size.
+
+ Possible values: 0, 2^n (n: 0 - 24 (16Mi))
+
+ Default: 0
+
UDP variables
=============
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
index ddfe9208529a..f90bfba4b303 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
@@ -1069,8 +1069,7 @@ static void chtls_pass_accept_rpl(struct sk_buff *skb,
cxgb4_l2t_send(csk->egress_dev, skb, csk->l2t_entry);
}
-static void inet_inherit_port(struct inet_hashinfo *hash_info,
- struct sock *lsk, struct sock *newsk)
+static void inet_inherit_port(struct sock *lsk, struct sock *newsk)
{
local_bh_disable();
__inet_inherit_port(lsk, newsk);
@@ -1240,7 +1239,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk,
ipv4.sysctl_tcp_window_scaling),
tp->window_clamp);
neigh_release(n);
- inet_inherit_port(&tcp_hashinfo, lsk, newsk);
+ inet_inherit_port(lsk, newsk);
csk_set_flag(csk, CSK_CONN_INLINE);
bh_unlock_sock(newsk); /* tcp_create_openreq_child ->sk_clone_lock */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
index 13145ecaf839..5203393adf88 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
@@ -461,6 +461,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb)
{
struct ethhdr *eth = (struct ethhdr *)(skb->data);
struct net_device *netdev = rq->netdev;
+ struct net *net = dev_net(netdev);
struct sock *sk = NULL;
unsigned int datalen;
struct iphdr *iph;
@@ -475,7 +476,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb)
depth += sizeof(struct iphdr);
th = (void *)iph + sizeof(struct iphdr);
- sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo,
+ sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source, iph->daddr,
th->dest, netdev->ifindex);
#if IS_ENABLED(CONFIG_IPV6)
@@ -485,7 +486,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb)
depth += sizeof(struct ipv6hdr);
th = (void *)ipv6h + sizeof(struct ipv6hdr);
- sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->dest),
netdev->ifindex, 0);
diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
index 78368e71ce83..f80f1a6953fa 100644
--- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c
+++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
@@ -474,6 +474,7 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev,
{
struct nfp_net *nn = netdev_priv(netdev);
struct nfp_net_tls_offload_ctx *ntls;
+ struct net *net = dev_net(netdev);
struct ipv6hdr *ipv6h;
struct tcphdr *th;
struct iphdr *iph;
@@ -494,13 +495,13 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev,
switch (ipv6h->version) {
case 4:
- sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo,
+ sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source, iph->daddr,
th->dest, netdev->ifindex);
break;
#if IS_ENABLED(CONFIG_IPV6)
case 6:
- sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->dest),
netdev->ifindex, 0);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 44a419b9e3d5..9121ccab1fa1 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -168,8 +168,20 @@ struct inet_hashinfo {
/* The 2nd listener table hashed by local port and address */
unsigned int lhash2_mask;
struct inet_listen_hashbucket *lhash2;
+
+ bool pernet;
};
+static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IP_DCCP)
+ return sk->sk_prot->h.hashinfo ? :
+ sock_net(sk)->ipv4.tcp_death_row.hashinfo;
+#else
+ return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
+#endif
+}
+
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
@@ -204,6 +216,10 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
}
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+ unsigned int ehash_entries);
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo);
+
struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 6320a76cefdc..1b8004679445 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -34,6 +34,7 @@ struct inet_hashinfo;
struct inet_timewait_death_row {
refcount_t tw_refcount;
+ /* Padding to avoid false sharing, tw_refcount can be often written */
struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp;
int sysctl_max_tw_buckets;
};
@@ -41,7 +42,7 @@ struct inet_timewait_death_row {
struct tcp_fastopen_context;
struct netns_ipv4 {
- struct inet_timewait_death_row *tcp_death_row;
+ struct inet_timewait_death_row tcp_death_row;
#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
@@ -170,6 +171,7 @@ struct netns_ipv4 {
int sysctl_tcp_pacing_ca_ratio;
int sysctl_tcp_wmem[3];
int sysctl_tcp_rmem[3];
+ unsigned int sysctl_tcp_child_ehash_entries;
unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
int sysctl_max_syn_backlog;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 735e957f7f4b..27e8d378c70a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -346,6 +346,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
+void tcp_twsk_purge(struct list_head *net_exit_list, int family);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
diff --git a/net/core/filter.c b/net/core/filter.c
index e872f45399b0..31608801078e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6373,6 +6373,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;
@@ -6381,7 +6382,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
+ sk = __inet_lookup(net, hinfo, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -6395,7 +6396,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
+ sk = __inet6_lookup(net, hinfo, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7cd4a6cc99fc..c548ca3e9b0e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1197,6 +1197,8 @@ static int __init dccp_init(void)
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}
+ dccp_hashinfo.pernet = false;
+
rc = dccp_mib_init();
if (rc)
goto out_free_dccp_bhash2;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d3ab1ae32ef5..e2c219382345 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1250,7 +1250,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
}
prev_addr_hashbucket =
- inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk,
+ inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk,
sock_net(sk), inet->inet_num);
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 5c03eba787e5..6be0f8903be1 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -134,6 +134,7 @@ static void esp_free_tcp_sk(struct rcu_head *head)
static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
{
struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
struct esp_tcp_sk *esk;
__be16 sport, dport;
struct sock *nsk;
@@ -160,7 +161,7 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
}
spin_unlock_bh(&x->lock);
- sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4,
+ sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
dport, x->props.saddr.a4, sport, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f0038043b661..ebca860e113f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -285,16 +285,14 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
struct inet_bind2_bucket **tb2_ret,
struct inet_bind_hashbucket **head2_ret, int *port_ret)
{
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int port = 0;
+ struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ int i, low, high, attempt_half, port, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
- bool relax = false;
- int i, low, high, attempt_half;
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
u32 remaining, offset;
- int l3mdev;
+ bool relax = false;
l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
@@ -469,17 +467,16 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
+ struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, port = snum;
- struct net *net = sock_net(sk);
bool found_port = false, check_bind_conflict = true;
bool bhash_created = false, bhash2_created = false;
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2 = NULL;
struct inet_bind_bucket *tb = NULL;
bool head2_lock_acquired = false;
- int l3mdev;
+ int ret = 1, port = snum, l3mdev;
+ struct net *net = sock_net(sk);
l3mdev = inet_sk_bound_l3mdev(sk);
@@ -909,14 +906,15 @@ static void reqsk_migrate_reset(struct request_sock *req)
/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock *req)
{
- struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
+ struct sock *sk = req_to_sk(req);
bool found = false;
- if (sk_hashed(req_to_sk(req))) {
+ if (sk_hashed(sk)) {
+ struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
spin_lock(lock);
- found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+ found = __sk_nulls_del_node_init_rcu(sk);
spin_unlock(lock);
}
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 60d77e234a68..74e64aad5114 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -168,14 +168,15 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
*/
static void __inet_put_port(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
- hashinfo->bhash_size);
- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
- struct inet_bind_hashbucket *head2 =
- inet_bhashfn_portaddr(hashinfo, sk, sock_net(sk),
- inet_sk(sk)->inet_num);
+ struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_bind_hashbucket *head, *head2;
+ struct net *net = sock_net(sk);
struct inet_bind_bucket *tb;
+ int bhash;
+
+ bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
+ head = &hashinfo->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
@@ -207,19 +208,19 @@ EXPORT_SYMBOL(inet_put_port);
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
- struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+ struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
unsigned short port = inet_sk(child)->inet_num;
- const int bhash = inet_bhashfn(sock_net(sk), port,
- table->bhash_size);
- struct inet_bind_hashbucket *head = &table->bhash[bhash];
- struct inet_bind_hashbucket *head2 =
- inet_bhashfn_portaddr(table, child, sock_net(sk), port);
+ struct inet_bind_hashbucket *head, *head2;
bool created_inet_bind_bucket = false;
- bool update_fastreuse = false;
struct net *net = sock_net(sk);
+ bool update_fastreuse = false;
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
- int l3mdev;
+ int bhash, l3mdev;
+
+ bhash = inet_bhashfn(net, port, table->bhash_size);
+ head = &table->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(table, child, net, port);
spin_lock(&head->lock);
spin_lock(&head2->lock);
@@ -385,7 +386,7 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk;
bool no_reuseport;
- if (hashinfo != &tcp_hashinfo)
+ if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
return NULL; /* only TCP is supported */
no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
@@ -628,9 +629,9 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk,
*/
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct hlist_nulls_head *list;
+ struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
struct inet_ehash_bucket *head;
+ struct hlist_nulls_head *list;
spinlock_t *lock;
bool ret = true;
@@ -700,7 +701,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
int __inet_hash(struct sock *sk, struct sock *osk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
struct inet_listen_hashbucket *ilb2;
int err = 0;
@@ -746,7 +747,7 @@ EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
if (sk_unhashed(sk))
return;
@@ -833,7 +834,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
u32 hash;
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr addr_any = {};
@@ -849,7 +850,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk)
{
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
struct inet_bind2_bucket *tb2, *new_tb2;
int l3mdev = inet_sk_bound_l3mdev(sk);
struct inet_bind_hashbucket *head2;
@@ -1144,3 +1145,50 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
+
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+ unsigned int ehash_entries)
+{
+ struct inet_hashinfo *new_hashinfo;
+ int i;
+
+ new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
+ if (!new_hashinfo)
+ goto err;
+
+ new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
+ GFP_KERNEL_ACCOUNT);
+ if (!new_hashinfo->ehash)
+ goto free_hashinfo;
+
+ new_hashinfo->ehash_mask = ehash_entries - 1;
+
+ if (inet_ehash_locks_alloc(new_hashinfo))
+ goto free_ehash;
+
+ for (i = 0; i < ehash_entries; i++)
+ INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);
+
+ new_hashinfo->pernet = true;
+
+ return new_hashinfo;
+
+free_ehash:
+ vfree(new_hashinfo->ehash);
+free_hashinfo:
+ kfree(new_hashinfo);
+err:
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);
+
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
+{
+ if (!hashinfo->pernet)
+ return;
+
+ inet_ehash_locks_free(hashinfo);
+ vfree(hashinfo->ehash);
+ kfree(hashinfo);
+}
+EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 47ccc343c9fb..71d3bb0abf6c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -59,9 +59,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
- kfree(tw->tw_dr);
-
+ refcount_dec(&tw->tw_dr->tw_refcount);
inet_twsk_put(tw);
}
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 2d42e4c35a20..a1350fc25838 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -71,8 +71,8 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
+ return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo,
+ skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index b2bae0b0e42a..b22b2c745c76 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -79,6 +79,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -92,12 +93,10 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
- ip_hdrlen(skb) +
- __tcp_hdrlen(hp),
- saddr, sport,
- daddr, dport,
- in->ifindex, 0);
+ sk = inet_lookup_listener(net, hinfo, skb,
+ ip_hdrlen(skb) + __tcp_hdrlen(hp),
+ saddr, sport, daddr, dport,
+ in->ifindex, 0);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -108,9 +107,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, dport,
- in->ifindex);
+ sk = inet_lookup_established(net, hinfo, saddr, sport,
+ daddr, dport, in->ifindex);
break;
default:
BUG();
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0088a4c64d77..5386f460bd20 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,7 +59,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+ refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,
sockets, proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5490c285668b..9b8a6db7a66b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -39,6 +39,7 @@ static u32 u32_max_div_HZ = UINT_MAX / HZ;
static int one_day_secs = 24 * 3600;
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -382,6 +383,29 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl,
return ret;
}
+static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_child_ehash_entries);
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+ int tcp_ehash_entries;
+ struct ctl_table tbl;
+
+ tcp_ehash_entries = hinfo->ehash_mask + 1;
+
+ /* A negative number indicates that the child netns
+ * shares the global ehash.
+ */
+ if (!net_eq(net, &init_net) && !hinfo->pernet)
+ tcp_ehash_entries *= -1;
+
+ tbl.data = &tcp_ehash_entries;
+ tbl.maxlen = sizeof(int);
+
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
void *buffer, size_t *lenp,
@@ -530,10 +554,9 @@ static struct ctl_table ipv4_table[] = {
};
static struct ctl_table ipv4_net_table[] = {
- /* tcp_max_tw_buckets must be first in this table. */
{
.procname = "tcp_max_tw_buckets",
-/* .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+ .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -1322,6 +1345,21 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_ONE,
},
{
+ .procname = "tcp_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .mode = 0444,
+ .proc_handler = proc_tcp_ehash_entries,
+ },
+ {
+ .procname = "tcp_child_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_child_ehash_entries_max,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
@@ -1361,8 +1399,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!table)
goto err_alloc;
- /* skip first entry (sysctl_max_tw_buckets) */
- for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+ for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
if (table[i].data) {
/* Update the variables to point into
* the current struct net
@@ -1377,8 +1414,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
}
}
- table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
-
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (!net->ipv4.ipv4_hdr)
goto err_reg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8230be00ecca..829beee3fa32 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4790,6 +4790,7 @@ void __init tcp_init(void)
INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
+ tcp_hashinfo.pernet = false;
cnt = tcp_hashinfo.ehash_mask + 1;
sysctl_tcp_max_orphans = cnt / 2;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 75a1c985f49a..01b50fa79189 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -181,13 +181,21 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r);
+ struct inet_hashinfo *hinfo;
+
+ hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+
+ inet_diag_dump_icsk(hinfo, skb, cb, r);
}
static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req);
+ struct inet_hashinfo *hinfo;
+
+ hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+
+ return inet_diag_dump_one_icsk(hinfo, cb, req);
}
#ifdef CONFIG_INET_DIAG_DESTROY
@@ -195,9 +203,13 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
struct net *net = sock_net(in_skb->sk);
- struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+ struct inet_hashinfo *hinfo;
+ struct sock *sk;
int err;
+ hinfo = net->ipv4.tcp_death_row.hashinfo;
+ sk = inet_diag_find_one_icsk(net, hinfo, req);
+
if (IS_ERR(sk))
return PTR_ERR(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 01b31f5c7aba..6376ad915765 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -201,15 +201,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct inet_timewait_death_row *tcp_death_row;
__be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct ip_options_rcu *inet_opt;
+ struct net *net = sock_net(sk);
__be16 orig_sport, orig_dport;
struct flowi4 *fl4;
struct rtable *rt;
int err;
- struct ip_options_rcu *inet_opt;
- struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -235,7 +236,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return err;
}
@@ -247,11 +248,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet_opt || !inet_opt->opt.srr)
daddr = fl4->daddr;
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
if (!inet->inet_saddr) {
if (inet_csk(sk)->icsk_bind2_hash) {
- prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
- sk, sock_net(sk),
- inet->inet_num);
+ prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
+ sk, net, inet->inet_num);
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
}
inet->inet_saddr = fl4->saddr;
@@ -317,8 +319,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
usin->sin_port));
- tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
- inet->inet_saddr,
+ tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
inet->inet_daddr);
}
@@ -494,9 +495,9 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
int err;
struct net *net = dev_net(skb->dev);
- sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
- th->dest, iph->saddr, ntohs(th->source),
- inet_iif(skb), 0);
+ sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+ iph->daddr, th->dest, iph->saddr,
+ ntohs(th->source), inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return -ENOENT;
@@ -759,8 +760,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
- ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
+ NULL, 0, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
@@ -1728,6 +1729,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
@@ -1744,7 +1746,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (th->doff < sizeof(struct tcphdr) / 4)
return 0;
- sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif, inet_sdif(skb));
@@ -1970,7 +1972,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
- sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+ sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
+ skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -2152,9 +2155,9 @@ do_time_wait:
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
- struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
- &tcp_hashinfo, skb,
- __tcp_hdrlen(th),
+ struct sock *sk2 = inet_lookup_listener(net,
+ net->ipv4.tcp_death_row.hashinfo,
+ skb, __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
@@ -2304,15 +2307,16 @@ static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
*/
static void *listening_get_first(struct seq_file *seq)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
st->offset = 0;
- for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
+ for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
struct inet_listen_hashbucket *ilb2;
struct hlist_nulls_node *node;
struct sock *sk;
- ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+ ilb2 = &hinfo->lhash2[st->bucket];
if (hlist_nulls_empty(&ilb2->nulls_head))
continue;
@@ -2337,6 +2341,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct tcp_iter_state *st = seq->private;
struct inet_listen_hashbucket *ilb2;
struct hlist_nulls_node *node;
+ struct inet_hashinfo *hinfo;
struct sock *sk = cur;
++st->num;
@@ -2348,7 +2353,8 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
return sk;
}
- ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+ hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ ilb2 = &hinfo->lhash2[st->bucket];
spin_unlock(&ilb2->lock);
++st->bucket;
return listening_get_first(seq);
@@ -2370,9 +2376,10 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline bool empty_bucket(const struct tcp_iter_state *st)
+static inline bool empty_bucket(struct inet_hashinfo *hinfo,
+ const struct tcp_iter_state *st)
{
- return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
+ return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
/*
@@ -2381,20 +2388,21 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
*/
static void *established_get_first(struct seq_file *seq)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
st->offset = 0;
- for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+ for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
- spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+ spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
/* Lockless fast path for the common case of empty buckets */
- if (empty_bucket(st))
+ if (empty_bucket(hinfo, st))
continue;
spin_lock_bh(lock);
- sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+ sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
if (seq_sk_match(seq, sk))
return sk;
}
@@ -2406,9 +2414,10 @@ static void *established_get_first(struct seq_file *seq)
static void *established_get_next(struct seq_file *seq, void *cur)
{
- struct sock *sk = cur;
- struct hlist_nulls_node *node;
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
+ struct hlist_nulls_node *node;
+ struct sock *sk = cur;
++st->num;
++st->offset;
@@ -2420,7 +2429,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
return sk;
}
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
++st->bucket;
return established_get_first(seq);
}
@@ -2458,6 +2467,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
static void *tcp_seek_last_pos(struct seq_file *seq)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
int bucket = st->bucket;
int offset = st->offset;
@@ -2466,7 +2476,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
- if (st->bucket > tcp_hashinfo.lhash2_mask)
+ if (st->bucket > hinfo->lhash2_mask)
break;
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_first(seq);
@@ -2478,7 +2488,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
st->state = TCP_SEQ_STATE_ESTABLISHED;
fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
- if (st->bucket > tcp_hashinfo.ehash_mask)
+ if (st->bucket > hinfo->ehash_mask)
break;
rc = established_get_first(seq);
while (offset-- && rc && bucket == st->bucket)
@@ -2546,16 +2556,17 @@ EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
break;
}
}
@@ -2750,6 +2761,7 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
struct sock *start_sk)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
struct hlist_nulls_node *node;
@@ -2769,7 +2781,7 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
expected++;
}
}
- spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
return expected;
}
@@ -2777,6 +2789,7 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
struct sock *start_sk)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
struct hlist_nulls_node *node;
@@ -2796,13 +2809,14 @@ static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
expected++;
}
}
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
return expected;
}
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
unsigned int expected;
@@ -2818,7 +2832,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
st->offset = 0;
st->bucket++;
if (st->state == TCP_SEQ_STATE_LISTENING &&
- st->bucket > tcp_hashinfo.lhash2_mask) {
+ st->bucket > hinfo->lhash2_mask) {
st->state = TCP_SEQ_STATE_ESTABLISHED;
st->bucket = 0;
}
@@ -3083,7 +3097,7 @@ struct proto tcp_prot = {
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
+ .h.hashinfo = NULL,
.no_autobind = true,
.diag_destroy = tcp_abort,
};
@@ -3091,19 +3105,43 @@ EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
- struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
-
if (net->ipv4.tcp_congestion_control)
bpf_module_put(net->ipv4.tcp_congestion_control,
net->ipv4.tcp_congestion_control->owner);
- if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
- kfree(tcp_death_row);
}
-static int __net_init tcp_sk_init(struct net *net)
+static void __net_init tcp_set_hashinfo(struct net *net)
{
- int cnt;
+ struct inet_hashinfo *hinfo;
+ unsigned int ehash_entries;
+ struct net *old_net;
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
+ if (!ehash_entries)
+ goto fallback;
+
+ ehash_entries = roundup_pow_of_two(ehash_entries);
+ hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
+ if (!hinfo) {
+ pr_warn("Failed to allocate TCP ehash (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ ehash_entries);
+fallback:
+ hinfo = &tcp_hashinfo;
+ ehash_entries = tcp_hashinfo.ehash_mask + 1;
+ }
+
+ net->ipv4.tcp_death_row.hashinfo = hinfo;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
+ net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3129,15 +3167,9 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 2;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
- net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
- if (!net->ipv4.tcp_death_row)
- return -ENOMEM;
- refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
- cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
- net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
+ refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
+ tcp_set_hashinfo(net);
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
@@ -3199,10 +3231,13 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
- inet_twsk_purge(&tcp_hashinfo, AF_INET);
+ tcp_twsk_purge(net_exit_list, AF_INET);
- list_for_each_entry(net, net_exit_list, exit_list)
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
+ WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
tcp_fastopen_ctx_destroy(net);
+ }
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb95d88497ae..442838ab0253 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -247,10 +247,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct inet_timewait_sock *tw;
- struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
- tw = inet_twsk_alloc(sk, tcp_death_row, state);
+ tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -319,14 +319,14 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
/* Linkage updates.
* Note that access to tw after this point is illegal.
*/
- inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+ inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo);
local_bh_enable();
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
@@ -347,6 +347,26 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+void tcp_twsk_purge(struct list_head *net_exit_list, int family)
+{
+ bool purged_once = false;
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ /* The last refcount is decremented in tcp_sk_exit_batch() */
+ if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1)
+ continue;
+
+ if (net->ipv4.tcp_death_row.hashinfo->pernet) {
+ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family);
+ } else if (!purged_once) {
+ inet_twsk_purge(&tcp_hashinfo, family);
+ purged_once = true;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_purge);
+
/* Warning : This function is called without sk_listener being locked.
* Be sure to read socket fields once, as their value could change under us.
*/
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8220923a12f7..b10f9183801d 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -151,6 +151,7 @@ static void esp_free_tcp_sk(struct rcu_head *head)
static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
{
struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
struct esp_tcp_sk *esk;
__be16 sport, dport;
struct sock *nsk;
@@ -177,7 +178,7 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
}
spin_unlock_bh(&x->lock);
- sk = __inet6_lookup_established(xs_net(x), &tcp_hashinfo, &x->id.daddr.in6,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6,
dport, &x->props.saddr.in6, ntohs(sport), 0, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 7d53d62783b1..b64b49012655 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -21,8 +21,6 @@
#include <net/ip.h>
#include <net/sock_reuseport.h>
-extern struct inet_hashinfo tcp_hashinfo;
-
u32 inet6_ehashfn(const struct net *net,
const struct in6_addr *laddr, const u16 lport,
const struct in6_addr *faddr, const __be16 fport)
@@ -169,7 +167,7 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk;
bool no_reuseport;
- if (hashinfo != &tcp_hashinfo)
+ if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
return NULL; /* only TCP is supported */
no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport,
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index aa5bb8789ba0..a7690ec62325 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -83,8 +83,8 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet6_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
+ return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo,
+ skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp6_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index 6bac68fb27a3..929502e51203 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -80,6 +80,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -93,7 +94,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+ sk = inet6_lookup_listener(net, hinfo, skb,
thoff + __tcp_hdrlen(hp),
saddr, sport,
daddr, ntohs(dport),
@@ -108,9 +109,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, ntohs(dport),
- in->ifindex, 0);
+ sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr,
+ ntohs(dport), in->ifindex, 0);
break;
default:
BUG();
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 35013497e407..06beed4e23bf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -146,15 +146,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct in6_addr *saddr = NULL, *final_p, final;
struct inet_timewait_death_row *tcp_death_row;
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p, final;
+ struct net *net = sock_net(sk);
struct ipv6_txoptions *opt;
- struct flowi6 fl6;
struct dst_entry *dst;
+ struct flowi6 fl6;
int addr_type;
int err;
@@ -280,20 +281,21 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
}
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
if (!saddr) {
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct in6_addr prev_v6_rcv_saddr;
if (icsk->icsk_bind2_hash) {
- prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
- sk, sock_net(sk),
- inet->inet_num);
+ prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
+ sk, net, inet->inet_num);
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
}
saddr = &fl6.saddr;
@@ -325,7 +327,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
- tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
err = inet6_hash_connect(tcp_death_row, sk);
if (err)
goto late_failure;
@@ -339,8 +340,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
inet->inet_dport));
- tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk),
- np->saddr.s6_addr32,
+ tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32);
}
@@ -403,7 +403,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bool fatal;
int err;
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
@@ -1036,11 +1036,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(net,
- &tcp_hashinfo, NULL, 0,
- &ipv6h->saddr,
- th->source, &ipv6h->daddr,
- ntohs(th->source), dif, sdif);
+ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
+ NULL, 0, &ipv6h->saddr, th->source,
+ &ipv6h->daddr, ntohs(th->source),
+ dif, sdif);
if (!sk1)
goto out;
@@ -1638,7 +1637,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
+ sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
@@ -1813,7 +1812,7 @@ do_time_wait:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+ sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
@@ -1846,6 +1845,7 @@ do_time_wait:
void tcp_v6_early_demux(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
const struct ipv6hdr *hdr;
const struct tcphdr *th;
struct sock *sk;
@@ -1863,7 +1863,7 @@ void tcp_v6_early_demux(struct sk_buff *skb)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
- sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
@@ -2195,7 +2195,7 @@ struct proto tcpv6_prot = {
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
+ .h.hashinfo = NULL,
.no_autobind = true,
.diag_destroy = tcp_abort,
};
@@ -2229,7 +2229,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
{
- inet_twsk_purge(&tcp_hashinfo, AF_INET6);
+ tcp_twsk_purge(net_exit_list, AF_INET6);
}
static struct pernet_operations tcpv6_net_ops = {
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 7f9a71780437..8df1bdb647e2 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -81,15 +81,18 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct nlattr *bc = cb_data->inet_diag_nla_bc;
struct net *net = sock_net(skb->sk);
+ struct inet_hashinfo *hinfo;
int i;
- for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) {
+ hinfo = net->ipv4.tcp_death_row.hashinfo;
+
+ for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) {
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
struct sock *sk;
int num = 0;
- ilb = &tcp_hashinfo.lhash2[i];
+ ilb = &hinfo->lhash2[i];
rcu_read_lock();
spin_lock(&ilb->lock);