summaryrefslogtreecommitdiff
path: root/net/netfilter
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-05-09 15:02:58 -0400
committerDavid S. Miller <davem@davemloft.net>2016-05-09 15:02:58 -0400
commite8ed77dfa90dd79c5343415a4bbbfdab9787b35a (patch)
tree04ce7f294e9a11c1addf1e19662f7c30d7da90bf /net/netfilter
parente26522cd0b63fdbf3b4e9a39d73a985cc9b4fe27 (diff)
parent0c5366b3a8c77fd6d67b763c5a76dfdc314e7726 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Pablo Neira Ayuso says: ==================== Netfilter updates for net-next The following large patchset contains Netfilter updates for your net-next tree. My initial intention was to send you this in two goes but when I looked back twice I already had this burden on top of me. Several updates for IPVS from Marco Angaroni: 1) Allow SIP connections originating from real-servers to be load balanced by the SIP persistence engine as is already implemented in the other direction. 2) Release connections immediately for One-packet-scheduling (OPS) in IPVS, instead of making it via timer and rcu callback. 3) Skip deleting conntracks for each one packet in OPS, and don't call nf_conntrack_alter_reply() since no reply is expected. 4) Enable drop on exhaustion for OPS + SIP persistence. Miscelaneous conntrack updates from Florian Westphal, including fix for hash resize: 5) Move conntrack generation counter out of conntrack pernet structure since this is only used by the init_ns to allow hash resizing. 6) Use get_random_once() from packet path to collect hash random seed instead of our compound. 7) Don't disable BH from ____nf_conntrack_find() for statistics, use NF_CT_STAT_INC_ATOMIC() instead. 8) Fix lookup race during conntrack hash resizing. 9) Introduce clash resolution on conntrack insertion for connectionless protocol. Then, Florian's netns rework to get rid of per-netns conntrack table, thus we use one single table for them all. There was consensus on this change during the NFWS 2015 and, on top of that, it has recently been pointed as a source of multiple problems from unpriviledged netns: 11) Use a single conntrack hashtable for all namespaces. Include netns in object comparisons and make it part of the hash calculation. Adapt early_drop() to consider netns. 12) Use single expectation and NAT hashtable for all namespaces. 13) Use a single slab cache for all namespaces for conntrack objects. 14) Skip full table scanning from nf_ct_iterate_cleanup() if the pernet conntrack counter tells us the table is empty (ie. equals zero). Fixes for nf_tables interval set element handling, support to set conntrack connlabels and allow set names up to 32 bytes. 15) Parse element flags from element deletion path and pass it up to the backend set implementation. 16) Allow adjacent intervals in the rbtree set type for dynamic interval updates. 17) Add support to set connlabel from nf_tables, from Florian Westphal. 18) Allow set names up to 32 bytes in nf_tables. Several x_tables fixes and updates: 19) Fix incorrect use of IS_ERR_VALUE() in x_tables, original patch from Andrzej Hajda. And finally, miscelaneous netfilter updates such as: 20) Disable automatic helper assignment by default. Note this proc knob was introduced by a9006892643a ("netfilter: nf_ct_helper: allow to disable automatic helper assignment") 4 years ago to start moving towards explicit conntrack helper configuration via iptables CT target. 21) Get rid of obsolete and inconsistent debugging instrumentation in x_tables. 22) Remove unnecessary check for null after ip6_route_output(). ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c51
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c162
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c15
-rw-r--r--net/netfilter/nf_conntrack_core.c415
-rw-r--r--net/netfilter/nf_conntrack_expect.c83
-rw-r--r--net/netfilter/nf_conntrack_helper.c12
-rw-r--r--net/netfilter/nf_conntrack_netlink.c29
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c2
-rw-r--r--net/netfilter/nf_conntrack_standalone.c13
-rw-r--r--net/netfilter/nf_nat_core.c39
-rw-r--r--net/netfilter/nf_tables_api.c78
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c6
-rw-r--r--net/netfilter/nft_ct.c30
-rw-r--r--net/netfilter/nft_rbtree.c49
17 files changed, 726 insertions, 310 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 85ca189bdc3d..2cb3c626cd43 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
+static void ip_vs_conn_expire(unsigned long data);
/*
* Returns hash value for IPVS connection entry
@@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp)
+{
+ __ip_vs_conn_put(cp);
+ ip_vs_conn_expire((unsigned long)cp);
+}
+
/*
* Put back the conn and restart its timer with its timeout
*/
-void ip_vs_conn_put(struct ip_vs_conn *cp)
+static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
{
unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
0 : cp->timeout;
@@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
__ip_vs_conn_put(cp);
}
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
+ (atomic_read(&cp->refcnt) == 1) &&
+ !timer_pending(&cp->timer))
+ /* expire connection immediately */
+ __ip_vs_conn_put_notimer(cp);
+ else
+ __ip_vs_conn_put_timer(cp);
+}
/*
* Fill a no_client_port connection with a client port number
@@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
- if (cp->flags & IP_VS_CONN_F_NFCT) {
+ if ((cp->flags & IP_VS_CONN_F_NFCT) &&
+ !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
/* Do not access conntracks during subsys cleanup
* because nf_conntrack_find_get can not be used after
* conntrack cleanup for the net.
@@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ ip_vs_conn_rcu_free(&cp->rcu_head);
+ else
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
return;
}
@@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
- ip_vs_conn_put(cp);
+ __ip_vs_conn_put_timer(cp);
}
/* Modify timer, so that it expires as soon as possible.
@@ -1240,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
return 1;
}
+static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
+{
+ struct ip_vs_service *svc;
+
+ if (!cp->dest)
+ return false;
+ svc = rcu_dereference(cp->dest->svc);
+ return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
+}
+
/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
@@ -1254,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- /* connection template */
- continue;
if (cp->ipvs != ipvs)
continue;
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ if (atomic_read(&cp->n_control) ||
+ !ip_vs_conn_ops_mode(cp))
+ continue;
+ else
+ /* connection template of OPS */
+ goto try_drop;
+ }
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1286,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
continue;
}
} else {
+try_drop:
if (!todrop_entry(cp))
continue;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b9a4082afa3a..1207f20d24e4 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+EXPORT_SYMBOL(ip_vs_new_conn_out);
static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
@@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
- atomic_inc(&cp->in_pkts);
+ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+ atomic_inc(&cp->control->in_pkts);
+ else
+ atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
@@ -1100,6 +1104,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
}
}
+/* Generic function to create new connections for outgoing RS packets
+ *
+ * Pre-requisites for successful connection creation:
+ * 1) Virtual Service is NOT fwmark based:
+ * In fwmark-VS actual vaddr and vport are unknown to IPVS
+ * 2) Real Server and Virtual Service were NOT configured without port:
+ * This is to allow match of different VS to the same RS ip-addr
+ */
+struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ __be16 dport,
+ __be16 cport)
+{
+ struct ip_vs_conn_param param;
+ struct ip_vs_conn *ct = NULL, *cp = NULL;
+ const union nf_inet_addr *vaddr, *daddr, *caddr;
+ union nf_inet_addr snet;
+ __be16 vport;
+ unsigned int flags;
+
+ EnterFunction(12);
+ vaddr = &svc->addr;
+ vport = svc->port;
+ daddr = &iph->saddr;
+ caddr = &iph->daddr;
+
+ /* check pre-requisites are satisfied */
+ if (svc->fwmark)
+ return NULL;
+ if (!vport || !dport)
+ return NULL;
+
+ /* for persistent service first create connection template */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+ /* apply netmask the same way ingress-side does */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ ipv6_addr_prefix(&snet.in6, &caddr->in6,
+ (__force __u32)svc->netmask);
+ else
+#endif
+ snet.ip = caddr->ip & svc->netmask;
+ /* fill params and create template if not existent */
+ if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
+ &snet, 0, vaddr,
+ vport, &param) < 0)
+ return NULL;
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct) {
+ ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest, 0);
+ if (!ct) {
+ kfree(param.pe_data);
+ return NULL;
+ }
+ ct->timeout = svc->timeout;
+ } else {
+ kfree(param.pe_data);
+ }
+ }
+
+ /* connection flags */
+ flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
+ iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
+ /* create connection */
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
+ caddr, cport, vaddr, vport, &param);
+ cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
+ if (!cp) {
+ if (ct)
+ ip_vs_conn_put(ct);
+ return NULL;
+ }
+ if (ct) {
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+ }
+ ip_vs_conn_stats(cp, svc);
+
+ /* return connection (will be used to handle outgoing packet) */
+ IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
+ "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+ LeaveFunction(12);
+ return cp;
+}
+
+/* Handle outgoing packets which are considered requests initiated by
+ * real servers, so that subsequent responses from external client can be
+ * routed to the right real server.
+ * Used also for outgoing responses in OPS mode.
+ *
+ * Connection management is handled by persistent-engine specific callback.
+ */
+static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
+ struct netns_ipvs *ipvs,
+ int af, struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *cp = NULL;
+ __be16 _ports[2], *pptr;
+
+ if (hooknum == NF_INET_LOCAL_IN)
+ return NULL;
+
+ pptr = frag_safe_skb_hp(skb, iph->len,
+ sizeof(_ports), _ports, iph);
+ if (!pptr)
+ return NULL;
+
+ rcu_read_lock();
+ dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
+ &iph->saddr, pptr[0]);
+ if (dest) {
+ struct ip_vs_service *svc;
+ struct ip_vs_pe *pe;
+
+ svc = rcu_dereference(dest->svc);
+ if (svc) {
+ pe = rcu_dereference(svc->pe);
+ if (pe && pe->conn_out)
+ cp = pe->conn_out(svc, dest, skb, iph,
+ pptr[0], pptr[1]);
+ }
+ }
+ rcu_read_unlock();
+
+ return cp;
+}
+
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
@@ -1245,6 +1386,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
+
+ /* Check for real-server-started requests */
+ if (atomic_read(&ipvs->conn_out_counter)) {
+ /* Currently only for UDP:
+ * connection oriented protocols typically use
+ * ephemeral ports for outgoing connections, so
+ * related incoming responses would not match any VS
+ */
+ if (pp->protocol == IPPROTO_UDP) {
+ cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
+ if (likely(cp))
+ return handle_response(af, skb, pd, cp, &iph,
+ hooknum);
+ }
+ }
+
if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
@@ -1837,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
+ else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+ /* increment is done inside ip_vs_sync_conn too */
+ atomic_inc(&cp->control->in_pkts);
ip_vs_conn_put(cp);
return ret;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index f35ebc02fa5c..c3c809b2e712 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
return false;
}
+/* Find real service record by <proto,addr,port>.
+ * In case of multiple records with the same <proto,addr,port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
+ __u16 protocol,
+ const union nf_inet_addr *daddr,
+ __be16 dport)
+{
+ unsigned int hash;
+ struct ip_vs_dest *dest;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
/* Lookup destination by {addr,port} in the given service
* Called under RCU lock.
*/
@@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
+ if (svc->pe && svc->pe->conn_out)
+ atomic_inc(&ipvs->conn_out_counter);
ip_vs_start_estimator(ipvs, &svc->stats);
@@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
+ bool new_pe_conn_out, old_pe_conn_out;
/*
* Lookup the scheduler, by 'u->sched_name'
@@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1);
- if (pe != old_pe)
+ if (pe != old_pe) {
rcu_assign_pointer(svc->pe, pe);
+ /* check for optional methods in new pe */
+ new_pe_conn_out = (pe && pe->conn_out) ? true : false;
+ old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
+ if (new_pe_conn_out && !old_pe_conn_out)
+ atomic_inc(&svc->ipvs->conn_out_counter);
+ if (old_pe_conn_out && !new_pe_conn_out)
+ atomic_dec(&svc->ipvs->conn_out_counter);
+ }
out:
ip_vs_scheduler_put(old_sched);
@@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/* Unbind persistence engine, keep svc->pe */
old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (old_pe && old_pe->conn_out)
+ atomic_dec(&ipvs->conn_out_counter);
ip_vs_pe_put(old_pe);
/*
@@ -3969,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
(unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
+ atomic_set(&ipvs->conn_out_counter, 0);
/* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 30434fb133df..f04fd8df210b 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return;
+ /* Never alter conntrack for OPS conns (no reply is expected) */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
/* Alter reply only in original direction */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return;
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 0a6eb5c0d9e9..d07ef9e31c12 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
return cp->pe_data_len;
}
+static struct ip_vs_conn *
+ip_vs_sip_conn_out(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ __be16 dport,
+ __be16 cport)
+{
+ if (likely(iph->protocol == IPPROTO_UDP))
+ return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport);
+ /* currently no need to handle other than UDP */
+ return NULL;
+}
+
static struct ip_vs_pe ip_vs_sip_pe =
{
.name = "sip",
@@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe =
.ct_match = ip_vs_sip_ct_match,
.hashkey_raw = ip_vs_sip_hashkey_raw,
.show_pe_data = ip_vs_sip_show_pe_data,
+ .conn_out = ip_vs_sip_conn_out,
};
static int __init ip_vs_sip_init(void)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 2fd607408998..0cd29365004f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -54,6 +54,7 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <net/netns/hash.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -68,7 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+
+static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly seqcount_t nf_conntrack_generation;
static __read_mostly bool nf_conntrack_locks_all;
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
@@ -107,7 +113,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
spin_lock_nested(&nf_conntrack_locks[h1],
SINGLE_DEPTH_NESTING);
}
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
nf_conntrack_double_unlock(h1, h2);
return true;
}
@@ -141,43 +147,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-unsigned int nf_conntrack_hash_rnd __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ const struct net *net)
{
unsigned int n;
+ u32 seed;
+
+ get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
+ seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
+ return jhash2((u32 *)tuple, n, seed ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
}
-static u32 __hash_bucket(u32 hash, unsigned int size)
-{
- return reciprocal_scale(hash, size);
-}
-
-static u32 hash_bucket(u32 hash, const struct net *net)
+static u32 scale_hash(u32 hash)
{
- return __hash_bucket(hash, net->ct.htable_size);
+ return reciprocal_scale(hash, nf_conntrack_htable_size);
}
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- unsigned int size)
+static u32 __hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int size)
{
- return __hash_bucket(hash_conntrack_raw(tuple), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}
-static inline u_int32_t hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, net->ct.htable_size);
+ return scale_hash(hash_conntrack_raw(tuple, net));
}
bool
@@ -358,7 +364,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
}
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
- if (l4proto && l4proto->destroy)
+ if (l4proto->destroy)
l4proto->destroy(ct);
rcu_read_unlock();
@@ -393,7 +399,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -445,7 +451,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -454,7 +461,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
*/
return nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
- nf_ct_is_confirmed(ct);
+ nf_ct_is_confirmed(ct) &&
+ net_eq(net, nf_ct_net(ct));
}
/*
@@ -467,21 +475,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int bucket = hash_bucket(hash, net);
+ unsigned int bucket, sequence;
- /* Disable BHs the entire time since we normally need to disable them
- * at least once for the stats anyway.
- */
- local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
- if (nf_ct_key_equal(h, tuple, zone)) {
- NF_CT_STAT_INC(net, found);
- local_bh_enable();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ bucket = scale_hash(hash);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+ if (nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
return h;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
/*
* if the nulls value we got at the end of this lookup is
@@ -489,10 +499,9 @@ begin:
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(n) != bucket) {
- NF_CT_STAT_INC(net, search_restart);
+ NF_CT_STAT_INC_ATOMIC(net, search_restart);
goto begin;
}
- local_bh_enable();
return NULL;
}
@@ -514,7 +523,7 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
nf_ct_put(ct);
goto begin;
}
@@ -530,7 +539,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple));
+ hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -538,12 +547,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
unsigned int hash,
unsigned int reply_hash)
{
- struct net *net = nf_ct_net(ct);
-
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &net->ct.hash[hash]);
+ &nf_conntrack_hash[hash]);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
- &net->ct.hash[reply_hash]);
+ &nf_conntrack_hash[reply_hash]);
}
int
@@ -560,7 +567,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -568,17 +575,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
add_timer(&ct->timeout);
@@ -599,6 +603,63 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
+static inline void nf_ct_acct_update(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int len)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+}
+
+static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ const struct nf_conn *loser_ct)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(loser_ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+ enum ip_conntrack_info ctinfo;
+ unsigned int bytes;
+
+ /* u32 should be fine since we must have seen one packet. */
+ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
+ nf_ct_acct_update(ct, ctinfo, bytes);
+ }
+}
+
+/* Resolve race on insertion if this protocol allows this. */
+static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct nf_conntrack_l4proto *l4proto;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->allow_clash &&
+ !nf_ct_is_dying(ct) &&
+ atomic_inc_not_zero(&ct->ct_general.use)) {
+ nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
+ nf_conntrack_put(skb->nfct);
+ /* Assign conntrack already in hashes to this skbuff. Don't
+ * modify skb->nfctinfo to ensure consistent stateful filtering.
+ */
+ skb->nfct = &ct->ct_general;
+ return NF_ACCEPT;
+ }
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
@@ -613,6 +674,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
+ int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
@@ -628,10 +690,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
- hash = hash_bucket(hash, net);
+ hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -655,23 +717,22 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- if (unlikely(nf_ct_is_dying(ct)))
- goto out;
+ if (unlikely(nf_ct_is_dying(ct))) {
+ nf_ct_add_to_dying_list(ct);
+ goto dying;
+ }
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
/* Timer relative to confirmation time, not original
@@ -710,10 +771,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
out:
nf_ct_add_to_dying_list(ct);
+ ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+dying:
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return NF_DROP;
+ return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -726,29 +789,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
struct net *net = nf_ct_net(ignored_conntrack);
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
+ unsigned int hash, sequence;
struct hlist_nulls_node *n;
struct nf_conn *ct;
- unsigned int hash;
zone = nf_ct_zone(ignored_conntrack);
- hash = hash_conntrack(net, tuple);
- /* Disable BHs the entire time since we need to disable them at
- * least once for the stats anyway.
- */
- rcu_read_lock_bh();
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ rcu_read_lock();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ hash = hash_conntrack(net, tuple);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
- nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
- NF_CT_STAT_INC(net, found);
- rcu_read_unlock_bh();
+ nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
+ rcu_read_unlock();
return 1;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return 0;
}
@@ -762,71 +827,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
{
/* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
- struct nf_conn *ct = NULL, *tmp;
+ struct nf_conn *tmp;
struct hlist_nulls_node *n;
- unsigned int i = 0, cnt = 0;
- int dropped = 0;
- unsigned int hash, sequence;
+ unsigned int i, hash, sequence;
+ struct nf_conn *ct = NULL;
spinlock_t *lockp;
+ bool ret = false;
+
+ i = 0;
local_bh_disable();
restart:
- sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_bucket(_hash, net);
- for (; i < net->ct.htable_size; i++) {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ for (; i < NF_CT_EVICTION_RANGE; i++) {
+ hash = scale_hash(_hash++);
lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
spin_unlock(lockp);
goto restart;
}
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
- hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+ hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
- !nf_ct_is_dying(tmp) &&
- atomic_inc_not_zero(&tmp->ct_general.use)) {
+
+ if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
+ !net_eq(nf_ct_net(tmp), net) ||
+ nf_ct_is_dying(tmp))
+ continue;
+
+ if (atomic_inc_not_zero(&tmp->ct_general.use)) {
ct = tmp;
break;
}
- cnt++;
}
- hash = (hash + 1) % net->ct.htable_size;
spin_unlock(lockp);
-
- if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ if (ct)
break;
-
}
+
local_bh_enable();
if (!ct)
- return dropped;
+ return false;
- if (del_timer(&ct->timeout)) {
+ /* kill only if in same netns -- might have moved due to
+ * SLAB_DESTROY_BY_RCU rules
+ */
+ if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) {
if (nf_ct_delete(ct, 0, 0)) {
- dropped = 1;
NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ ret = true;
}
}
- nf_ct_put(ct);
- return dropped;
-}
-
-void init_nf_conntrack_hash_rnd(void)
-{
- unsigned int rand;
- /*
- * Why not initialize nf_conntrack_rnd in a "init()" function ?
- * Because there isn't enough entropy when system initializing,
- * and we initialize it as late as possible.
- */
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+ nf_ct_put(ct);
+ return ret;
}
static struct nf_conn *
@@ -838,12 +895,6 @@ __nf_conntrack_alloc(struct net *net,
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- /* recompute the hash as nf_conntrack_hash_rnd is initialized */
- hash = hash_conntrack_raw(orig);
- }
-
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);
@@ -860,7 +911,7 @@ __nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
goto out;
@@ -887,7 +938,7 @@ __nf_conntrack_alloc(struct net *net,
atomic_set(&ct->ct_general.use, 0);
return ct;
out_free:
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
@@ -914,7 +965,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
nf_ct_ext_free(ct);
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
smp_mb__before_atomic();
atomic_dec(&net->ct.count);
}
@@ -1061,7 +1112,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple);
+ hash = hash_conntrack_raw(&tuple, net);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1270,17 +1321,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
}
acct:
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1289,18 +1331,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
const struct sk_buff *skb,
int do_acct)
{
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len - skb_network_offset(skb),
- &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
if (del_timer(&ct->timeout)) {
ct->timeout.function((unsigned long)ct);
@@ -1396,16 +1428,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
int cpu;
spinlock_t *lockp;
- for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (*bucket < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
+ if (net_eq(nf_ct_net(ct), net) &&
+ iter(ct, data))
goto found;
}
}
@@ -1443,6 +1476,9 @@ void nf_ct_iterate_cleanup(struct net *net,
might_sleep();
+ if (atomic_read(&net->ct.count) == 0)
+ return;
+
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
@@ -1494,6 +1530,8 @@ void nf_conntrack_cleanup_end(void)
while (untrack_refs() > 0)
schedule();
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+
#ifdef CONFIG_NF_CONNTRACK_ZONES
nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
@@ -1544,15 +1582,12 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
nf_conntrack_proto_pernet_fini(net);
nf_conntrack_helper_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_tstamp_pernet_fini(net);
nf_conntrack_acct_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
free_percpu(net->ct.stat);
free_percpu(net->ct.pcpu_lists);
}
@@ -1607,7 +1642,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
local_bh_disable();
nf_conntrack_all_lock();
- write_seqcount_begin(&init_net.ct.generation);
+ write_seqcount_begin(&nf_conntrack_generation);
/* Lookups in the old hash might happen in parallel, which means we
* might get false negatives during connection lookup. New connections
@@ -1615,26 +1650,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
* though since that required taking the locks.
*/
- for (i = 0; i < init_net.ct.htable_size; i++) {
- while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
- h = hlist_nulls_entry(init_net.ct.hash[i].first,
- struct nf_conntrack_tuple_hash, hnnode);
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
+ while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ h = hlist_nulls_entry(nf_conntrack_hash[i].first,
+ struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
- bucket = __hash_conntrack(&h->tuple, hashsize);
+ bucket = __hash_conntrack(nf_ct_net(ct),
+ &h->tuple, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = init_net.ct.htable_size;
- old_hash = init_net.ct.hash;
+ old_size = nf_conntrack_htable_size;
+ old_hash = nf_conntrack_hash;
- init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash = hash;
+ nf_conntrack_hash = hash;
+ nf_conntrack_htable_size = hashsize;
- write_seqcount_end(&init_net.ct.generation);
+ write_seqcount_end(&nf_conntrack_generation);
nf_conntrack_all_unlock();
local_bh_enable();
+ synchronize_net();
nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
@@ -1655,7 +1692,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
int nf_conntrack_init_start(void)
{
int max_factor = 8;
- int i, ret, cpu;
+ int ret = -ENOMEM;
+ int i, cpu;
+
+ seqcount_init(&nf_conntrack_generation);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
@@ -1682,8 +1722,19 @@ int nf_conntrack_init_start(void)
* entries. */
max_factor = 4;
}
+
+ nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+ if (!nf_conntrack_hash)
+ return -ENOMEM;
+
nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!nf_conntrack_cachep)
+ goto err_cachep;
+
printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
@@ -1760,6 +1811,9 @@ err_tstamp:
err_acct:
nf_conntrack_expect_fini();
err_expect:
+ kmem_cache_destroy(nf_conntrack_cachep);
+err_cachep:
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
return ret;
}
@@ -1783,7 +1837,6 @@ int nf_conntrack_init_net(struct net *net)
int cpu;
atomic_set(&net->ct.count, 0);
- seqcount_init(&net->ct.generation);
net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
if (!net->ct.pcpu_lists)
@@ -1801,24 +1854,6 @@ int nf_conntrack_init_net(struct net *net)
if (!net->ct.stat)
goto err_pcpu_lists;
- net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
- if (!net->ct.slabname)
- goto err_slabname;
-
- net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
- sizeof(struct nf_conn), 0,
- SLAB_DESTROY_BY_RCU, NULL);
- if (!net->ct.nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- goto err_cache;
- }
-
- net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
- if (!net->ct.hash) {
- printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
- goto err_hash;
- }
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
goto err_expect;
@@ -1850,12 +1885,6 @@ err_tstamp:
err_acct:
nf_conntrack_expect_pernet_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
-err_hash:
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
-err_cache:
- kfree(net->ct.slabname);
-err_slabname:
free_percpu(net->ct.stat);
err_pcpu_lists:
free_percpu(net->ct.pcpu_lists);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 278927ab0948..9e3693128313 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -24,6 +24,7 @@
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
+#include <net/netns/hash.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -35,9 +36,13 @@
unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+struct hlist_head *nf_ct_expect_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
+
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+static unsigned int nf_ct_expect_hashrnd __read_mostly;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
nf_ct_expect_put(exp);
}
-static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- unsigned int hash;
+ unsigned int hash, seed;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- }
+ get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
+
+ seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
(((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
+ (__force __u16)tuple->dst.u.all) ^ seed);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
+static bool
+nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_expect *i,
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
+{
+ return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ net_eq(net, nf_ct_net(i->master)) &&
+ nf_ct_zone_equal_any(i->master, zone);
+}
+
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
@@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net,
if (!net->ct.expect_count)
return NULL;
- h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
- if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone_equal_any(i->master, zone))
+ h = nf_ct_expect_dst_hash(net, tuple);
+ hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) {
+ if (nf_ct_exp_equal(tuple, i, zone, net))
return i;
}
return NULL;
@@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net,
if (!net->ct.expect_count)
return NULL;
- h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
+ h = nf_ct_expect_dst_hash(net, tuple);
+ hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
- nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone_equal_any(i->master, zone)) {
+ nf_ct_exp_equal(tuple, i, zone, net)) {
exp = i;
break;
}
@@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
}
return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
+ net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
@@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
return a->master == b->master && a->class == b->class &&
nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
@@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
- unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
+ unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple);
/* two references : one for hash insert, one for the timer */
atomic_add(2, &exp->use);
@@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
hlist_add_head(&exp->lnode, &master_help->expectations);
master_help->expecting[exp->class]++;
- hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+ hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
@@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
ret = -ESHUTDOWN;
goto out;
}
- h = nf_ct_expect_dst_hash(&expect->tuple);
- hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
+ h = nf_ct_expect_dst_hash(net, &expect->tuple);
+ hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
if (del_timer(&i->timeout)) {
nf_ct_unlink_expect(i);
@@ -468,12 +484,11 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
@@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
@@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
- int err = -ENOMEM;
-
net->ct.expect_count = 0;
- net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
- if (net->ct.expect_hash == NULL)
- goto err1;
-
- err = exp_proc_init(net);
- if (err < 0)
- goto err2;
-
- return 0;
-err2:
- nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
-err1:
- return err;
+ return exp_proc_init(net);
}
void nf_conntrack_expect_pernet_fini(struct net *net)
{
exp_proc_remove(net);
- nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}
int nf_conntrack_expect_init(void)
@@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void)
0, 0, NULL);
if (!nf_ct_expect_cachep)
return -ENOMEM;
+
+ nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
+ if (!nf_ct_expect_hash) {
+ kmem_cache_destroy(nf_ct_expect_cachep);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void)
{
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
+ nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 3b40ec575cd5..f703adb7e5f7 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = true;
+static bool nf_ct_auto_assign_helper __read_mostly = false;
module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 1)");
+ "Enable automatic conntrack helper assignment (default 0)");
#ifdef CONFIG_SYSCTL
static struct ctl_table helper_sysctl_table[] = {
@@ -400,7 +400,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i], hnode) {
+ &nf_ct_expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
if ((rcu_dereference_protected(
help->helper,
@@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_unlock_bh(&pcpu->lock);
}
local_bh_disable();
- for (i = 0; i < net->ct.htable_size; i++) {
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ if (i < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
unhelp(h, me);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 294a8e28cec4..a18d1ceabad5 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -824,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conn *)cb->args[1];
local_bh_disable();
- for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+ for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (cb->args[0] >= net->ct.htable_size) {
+ if (cb->args[0] >= nf_conntrack_htable_size) {
spin_unlock(lockp);
goto out;
}
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
- hnnode) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
+ hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
@@ -2629,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
- hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
+ hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]],
hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
+
+ if (!net_eq(nf_ct_net(exp->master), net))
+ continue;
+
if (cb->args[1]) {
if (exp != last)
continue;
@@ -2883,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i],
+ &nf_ct_expect_hash[i],
hnode) {
+
+ if (!net_eq(nf_ct_exp_net(exp), net))
+ continue;
+
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
@@ -2901,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i],
+ &nf_ct_expect_hash[i],
hnode) {
+
+ if (!net_eq(nf_ct_exp_net(exp), net))
+ continue;
+
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
NETLINK_CB(skb).portid,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 478f92f834b6..4fd040575ffe 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
@@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 1ac8ee13a873..9d692f5adb94 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
@@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0f1a45bcacb2..f87e84ebcec3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -54,14 +54,13 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < net->ct.htable_size;
+ st->bucket < nf_conntrack_htable_size;
st->bucket++) {
- n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= net->ct.htable_size)
+ if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
hlist_nulls_first_rcu(
- &net->ct.hash[st->bucket]));
+ &nf_conntrack_hash[st->bucket]));
}
return head;
}
@@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
{
.procname = "nf_conntrack_buckets",
- .data = &init_net.ct.htable_size,
+ .data = &nf_conntrack_htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
goto out_kmemdup;
table[1].data = &net->ct.count;
- table[2].data = &net->ct.htable_size;
table[3].data = &net->ct.sysctl_checksum;
table[4].data = &net->ct.sysctl_log_invalid;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 06a9f45771ab..6877a396f8fc 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
-hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
+ get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_conntrack_hash_rnd);
+ tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- return reciprocal_scale(hash, net->ct.nat_htable_size);
+ return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
@@ -196,9 +201,10 @@ find_appropriate_src(struct net *net,
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
- hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
+ hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) {
ct = nat->ct;
if (same_src(ct, tuple) &&
+ net_eq(net, nf_ct_net(ct)) &&
nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
@@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct,
nat = nfct_nat(ct);
nat->ct = ct;
hlist_add_head_rcu(&nat->bysource,
- &net->ct.nat_bysource[srchash]);
+ &nf_nat_bysource[srchash]);
spin_unlock_bh(&nf_nat_lock);
}
@@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static int __net_init nf_nat_net_init(struct net *net)
-{
- /* Leave them the same for the moment. */
- net->ct.nat_htable_size = net->ct.htable_size;
- net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
- if (!net->ct.nat_bysource)
- return -ENOMEM;
- return 0;
-}
-
static void __net_exit nf_nat_net_exit(struct net *net)
{
struct nf_nat_proto_clean clean = {};
nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
- synchronize_rcu();
- nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
- .init = nf_nat_net_init,
.exit = nf_nat_net_exit,
};
@@ -852,8 +845,16 @@ static int __init nf_nat_init(void)
{
int ret;
+ /* Leave them the same for the moment. */
+ nf_nat_htable_size = nf_conntrack_htable_size;
+
+ nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_bysource)
+ return -ENOMEM;
+
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -877,6 +878,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7a85a9dd37ad..4d292b933b5c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2317,7 +2317,7 @@ nft_select_set_ops(const struct nlattr * const nla[],
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_TABLE] = { .type = NLA_STRING },
[NFTA_SET_NAME] = { .type = NLA_STRING,
- .len = IFNAMSIZ - 1 },
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
@@ -2401,7 +2401,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
unsigned long *inuse;
unsigned int n = 0, min = 0;
- p = strnchr(name, IFNAMSIZ, '%');
+ p = strnchr(name, NFT_SET_MAXNAMELEN, '%');
if (p != NULL) {
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
@@ -2696,7 +2696,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
- char name[IFNAMSIZ];
+ char name[NFT_SET_MAXNAMELEN];
unsigned int size;
bool create;
u64 timeout;
@@ -3375,6 +3375,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem)
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
+static int nft_setelem_parse_flags(const struct nft_set *set,
+ const struct nlattr *attr, u32 *flags)
+{
+ if (attr == NULL)
+ return 0;
+
+ *flags = ntohl(nla_get_be32(attr));
+ if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ if (!(set->flags & NFT_SET_INTERVAL) &&
+ *flags & NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+
+ return 0;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
@@ -3388,8 +3404,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data data;
enum nft_registers dreg;
struct nft_trans *trans;
+ u32 flags = 0;
u64 timeout;
- u32 flags;
u8 ulen;
int err;
@@ -3403,17 +3419,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_prepare(&tmpl);
- flags = 0;
- if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
- flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
- if (flags & ~NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
- if (!(set->flags & NFT_SET_INTERVAL) &&
- flags & NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
- }
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -3582,9 +3592,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_set_ext_tmpl tmpl;
struct nft_data_desc desc;
struct nft_set_elem elem;
+ struct nft_set_ext *ext;
struct nft_trans *trans;
+ u32 flags = 0;
+ void *priv;
int err;
err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
@@ -3596,6 +3610,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_KEY] == NULL)
goto err1;
+ nft_set_ext_prepare(&tmpl);
+
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+
err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
@@ -3605,24 +3627,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
goto err2;
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
+
+ err = -ENOMEM;
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
+ GFP_KERNEL);
+ if (elem.priv == NULL)
+ goto err2;
+
+ ext = nft_set_elem_ext(set, elem.priv);
+ if (flags)
+ *nft_set_ext_flags(ext) = flags;
+
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (trans == NULL) {
err = -ENOMEM;
- goto err2;
+ goto err3;
}
- elem.priv = set->ops->deactivate(set, &elem);
- if (elem.priv == NULL) {
+ priv = set->ops->deactivate(set, &elem);
+ if (priv == NULL) {
err = -ENOENT;
- goto err3;
+ goto err4;
}
+ kfree(elem.priv);
+ elem.priv = priv;
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
-err3:
+err4:
kfree(trans);
+err3:
+ kfree(elem.priv);
err2:
nft_data_uninit(&elem.key.val, desc.type);
err1:
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 2671b9deb103..3c84f14326f5 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
int i;
local_bh_disable();
- for (i = 0; i < net->ct.htable_size; i++) {
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ if (i < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
untimeout(h, timeout);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 25998facefd0..137e308d5b24 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -198,6 +198,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
}
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_replace(ct,
+ &regs->data[priv->sreg],
+ &regs->data[priv->sreg],
+ NF_CT_LABELS_MAX_SIZE / sizeof(u32));
+ break;
+#endif
default:
break;
}
@@ -365,6 +373,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
len = FIELD_SIZEOF(struct nf_conn, mark);
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+ len = NF_CT_LABELS_MAX_SIZE;
+ err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
+ if (err)
+ return err;
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -384,6 +402,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
static void nft_ct_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
+ struct nft_ct *priv = nft_expr_priv(expr);
+
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_put(ctx->net);
+ break;
+#endif
+ default:
+ break;
+ }
+
nft_ct_l3proto_module_put(ctx->afi->family);
}
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index 1c30f41cff5b..f762094af7c1 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -29,6 +29,17 @@ struct nft_rbtree_elem {
struct nft_set_ext ext;
};
+static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
+{
+ return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
+ (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
+}
+
+static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
+ const struct nft_rbtree_elem *interval)
+{
+ return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+}
static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_set_ext **ext)
@@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_rbtree_elem *rbe, *interval = NULL;
const struct rb_node *parent;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ const void *this;
int d;
spin_lock_bh(&nft_rbtree_lock);
@@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
+ this = nft_set_ext_key(&rbe->ext);
+ d = memcmp(this, key, set->klen);
if (d < 0) {
parent = parent->rb_left;
+ /* In case of adjacent ranges, we always see the high
+ * part of the range in first place, before the low one.
+ * So don't update interval if the keys are equal.
+ */
+ if (interval && nft_rbtree_equal(set, this, interval))
+ continue;
interval = rbe;
} else if (d > 0)
parent = parent->rb_right;
@@ -56,9 +75,7 @@ found:
parent = parent->rb_left;
continue;
}
- if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(&rbe->ext) &
- NFT_SET_ELEM_INTERVAL_END)
+ if (nft_rbtree_interval_end(rbe))
goto out;
spin_unlock_bh(&nft_rbtree_lock);
@@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set,
else if (d > 0)
p = &parent->rb_right;
else {
- if (nft_set_elem_active(&rbe->ext, genmask))
- return -EEXIST;
- p = &parent->rb_left;
+ if (nft_set_elem_active(&rbe->ext, genmask)) {
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(new))
+ p = &parent->rb_left;
+ else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(new))
+ p = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
}
}
rb_link_node(&new->node, parent, p);
@@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
{
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
- struct nft_rbtree_elem *rbe;
+ struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int d;
@@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
parent = parent->rb_left;
continue;
}
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(this)) {
+ parent = parent->rb_left;
+ continue;
+ } else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(this)) {
+ parent = parent->rb_right;
+ continue;
+ }
nft_set_elem_change_active(set, &rbe->ext);
return rbe;
}