summaryrefslogtreecommitdiff
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c51
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c162
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c82
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c15
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c23
-rw-r--r--net/netfilter/nf_conntrack_core.c430
-rw-r--r--net/netfilter/nf_conntrack_ecache.c84
-rw-r--r--net/netfilter/nf_conntrack_expect.c83
-rw-r--r--net/netfilter/nf_conntrack_helper.c12
-rw-r--r--net/netfilter/nf_conntrack_labels.c44
-rw-r--r--net/netfilter/nf_conntrack_netlink.c166
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c4
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c8
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c8
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c2
-rw-r--r--net/netfilter/nf_conntrack_standalone.c13
-rw-r--r--net/netfilter/nf_nat_core.c39
-rw-r--r--net/netfilter/nf_tables_api.c102
-rw-r--r--net/netfilter/nf_tables_trace.c5
-rw-r--r--net/netfilter/nfnetlink_acct.c9
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c6
-rw-r--r--net/netfilter/nfnetlink_queue.c105
-rw-r--r--net/netfilter/nft_counter.c6
-rw-r--r--net/netfilter/nft_ct.c32
-rw-r--r--net/netfilter/nft_dynset.c3
-rw-r--r--net/netfilter/nft_hash.c4
-rw-r--r--net/netfilter/nft_limit.c6
-rw-r--r--net/netfilter/nft_rbtree.c49
-rw-r--r--net/netfilter/x_tables.c245
-rw-r--r--net/netfilter/xt_connlabel.c14
-rw-r--r--net/netfilter/xt_socket.c6
33 files changed, 1333 insertions, 491 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 85ca189bdc3d..2cb3c626cd43 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
+static void ip_vs_conn_expire(unsigned long data);
/*
* Returns hash value for IPVS connection entry
@@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp)
+{
+ __ip_vs_conn_put(cp);
+ ip_vs_conn_expire((unsigned long)cp);
+}
+
/*
* Put back the conn and restart its timer with its timeout
*/
-void ip_vs_conn_put(struct ip_vs_conn *cp)
+static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
{
unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
0 : cp->timeout;
@@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
__ip_vs_conn_put(cp);
}
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
+ (atomic_read(&cp->refcnt) == 1) &&
+ !timer_pending(&cp->timer))
+ /* expire connection immediately */
+ __ip_vs_conn_put_notimer(cp);
+ else
+ __ip_vs_conn_put_timer(cp);
+}
/*
* Fill a no_client_port connection with a client port number
@@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
- if (cp->flags & IP_VS_CONN_F_NFCT) {
+ if ((cp->flags & IP_VS_CONN_F_NFCT) &&
+ !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
/* Do not access conntracks during subsys cleanup
* because nf_conntrack_find_get can not be used after
* conntrack cleanup for the net.
@@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ ip_vs_conn_rcu_free(&cp->rcu_head);
+ else
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
return;
}
@@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
- ip_vs_conn_put(cp);
+ __ip_vs_conn_put_timer(cp);
}
/* Modify timer, so that it expires as soon as possible.
@@ -1240,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
return 1;
}
+static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
+{
+ struct ip_vs_service *svc;
+
+ if (!cp->dest)
+ return false;
+ svc = rcu_dereference(cp->dest->svc);
+ return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
+}
+
/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
@@ -1254,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- /* connection template */
- continue;
if (cp->ipvs != ipvs)
continue;
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ if (atomic_read(&cp->n_control) ||
+ !ip_vs_conn_ops_mode(cp))
+ continue;
+ else
+ /* connection template of OPS */
+ goto try_drop;
+ }
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1286,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
continue;
}
} else {
+try_drop:
if (!todrop_entry(cp))
continue;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b9a4082afa3a..1207f20d24e4 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+EXPORT_SYMBOL(ip_vs_new_conn_out);
static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
@@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
- atomic_inc(&cp->in_pkts);
+ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+ atomic_inc(&cp->control->in_pkts);
+ else
+ atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
@@ -1100,6 +1104,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
}
}
+/* Generic function to create new connections for outgoing RS packets
+ *
+ * Pre-requisites for successful connection creation:
+ * 1) Virtual Service is NOT fwmark based:
+ * In fwmark-VS actual vaddr and vport are unknown to IPVS
+ * 2) Real Server and Virtual Service were NOT configured without port:
+ * This is to allow match of different VS to the same RS ip-addr
+ */
+struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ __be16 dport,
+ __be16 cport)
+{
+ struct ip_vs_conn_param param;
+ struct ip_vs_conn *ct = NULL, *cp = NULL;
+ const union nf_inet_addr *vaddr, *daddr, *caddr;
+ union nf_inet_addr snet;
+ __be16 vport;
+ unsigned int flags;
+
+ EnterFunction(12);
+ vaddr = &svc->addr;
+ vport = svc->port;
+ daddr = &iph->saddr;
+ caddr = &iph->daddr;
+
+ /* check pre-requisites are satisfied */
+ if (svc->fwmark)
+ return NULL;
+ if (!vport || !dport)
+ return NULL;
+
+ /* for persistent service first create connection template */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+ /* apply netmask the same way ingress-side does */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ ipv6_addr_prefix(&snet.in6, &caddr->in6,
+ (__force __u32)svc->netmask);
+ else
+#endif
+ snet.ip = caddr->ip & svc->netmask;
+ /* fill params and create template if not existent */
+ if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
+ &snet, 0, vaddr,
+ vport, &param) < 0)
+ return NULL;
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct) {
+ ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest, 0);
+ if (!ct) {
+ kfree(param.pe_data);
+ return NULL;
+ }
+ ct->timeout = svc->timeout;
+ } else {
+ kfree(param.pe_data);
+ }
+ }
+
+ /* connection flags */
+ flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
+ iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
+ /* create connection */
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
+ caddr, cport, vaddr, vport, &param);
+ cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
+ if (!cp) {
+ if (ct)
+ ip_vs_conn_put(ct);
+ return NULL;
+ }
+ if (ct) {
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+ }
+ ip_vs_conn_stats(cp, svc);
+
+ /* return connection (will be used to handle outgoing packet) */
+ IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
+ "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+ LeaveFunction(12);
+ return cp;
+}
+
+/* Handle outgoing packets which are considered requests initiated by
+ * real servers, so that subsequent responses from external client can be
+ * routed to the right real server.
+ * Used also for outgoing responses in OPS mode.
+ *
+ * Connection management is handled by persistent-engine specific callback.
+ */
+static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
+ struct netns_ipvs *ipvs,
+ int af, struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *cp = NULL;
+ __be16 _ports[2], *pptr;
+
+ if (hooknum == NF_INET_LOCAL_IN)
+ return NULL;
+
+ pptr = frag_safe_skb_hp(skb, iph->len,
+ sizeof(_ports), _ports, iph);
+ if (!pptr)
+ return NULL;
+
+ rcu_read_lock();
+ dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
+ &iph->saddr, pptr[0]);
+ if (dest) {
+ struct ip_vs_service *svc;
+ struct ip_vs_pe *pe;
+
+ svc = rcu_dereference(dest->svc);
+ if (svc) {
+ pe = rcu_dereference(svc->pe);
+ if (pe && pe->conn_out)
+ cp = pe->conn_out(svc, dest, skb, iph,
+ pptr[0], pptr[1]);
+ }
+ }
+ rcu_read_unlock();
+
+ return cp;
+}
+
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
@@ -1245,6 +1386,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
+
+ /* Check for real-server-started requests */
+ if (atomic_read(&ipvs->conn_out_counter)) {
+ /* Currently only for UDP:
+ * connection oriented protocols typically use
+ * ephemeral ports for outgoing connections, so
+ * related incoming responses would not match any VS
+ */
+ if (pp->protocol == IPPROTO_UDP) {
+ cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
+ if (likely(cp))
+ return handle_response(af, skb, pd, cp, &iph,
+ hooknum);
+ }
+ }
+
if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
@@ -1837,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
+ else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+ /* increment is done inside ip_vs_sync_conn too */
+ atomic_inc(&cp->control->in_pkts);
ip_vs_conn_put(cp);
return ret;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 404b2a4f4b5b..c3c809b2e712 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
return false;
}
+/* Find real service record by <proto,addr,port>.
+ * In case of multiple records with the same <proto,addr,port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
+ __u16 protocol,
+ const union nf_inet_addr *daddr,
+ __be16 dport)
+{
+ unsigned int hash;
+ struct ip_vs_dest *dest;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
/* Lookup destination by {addr,port} in the given service
* Called under RCU lock.
*/
@@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
+ if (svc->pe && svc->pe->conn_out)
+ atomic_inc(&ipvs->conn_out_counter);
ip_vs_start_estimator(ipvs, &svc->stats);
@@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
+ bool new_pe_conn_out, old_pe_conn_out;
/*
* Lookup the scheduler, by 'u->sched_name'
@@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1);
- if (pe != old_pe)
+ if (pe != old_pe) {
rcu_assign_pointer(svc->pe, pe);
+ /* check for optional methods in new pe */
+ new_pe_conn_out = (pe && pe->conn_out) ? true : false;
+ old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
+ if (new_pe_conn_out && !old_pe_conn_out)
+ atomic_inc(&svc->ipvs->conn_out_counter);
+ if (old_pe_conn_out && !new_pe_conn_out)
+ atomic_dec(&svc->ipvs->conn_out_counter);
+ }
out:
ip_vs_scheduler_put(old_sched);
@@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/* Unbind persistence engine, keep svc->pe */
old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (old_pe && old_pe->conn_out)
+ atomic_dec(&ipvs->conn_out_counter);
ip_vs_pe_put(old_pe);
/*
@@ -2875,8 +2918,10 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
+ IPVS_STATS_ATTR_PAD) ||
nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
@@ -2900,16 +2945,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
if (!nl_stats)
return -EMSGSIZE;
- if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps))
+ if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
+ IPVS_STATS_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(skb, nl_stats);
@@ -3957,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
(unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
+ atomic_set(&ipvs->conn_out_counter, 0);
/* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 30434fb133df..f04fd8df210b 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return;
+ /* Never alter conntrack for OPS conns (no reply is expected) */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
/* Alter reply only in original direction */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return;
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 0a6eb5c0d9e9..d07ef9e31c12 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
return cp->pe_data_len;
}
+static struct ip_vs_conn *
+ip_vs_sip_conn_out(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ __be16 dport,
+ __be16 cport)
+{
+ if (likely(iph->protocol == IPPROTO_UDP))
+ return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport);
+ /* currently no need to handle other than UDP */
+ return NULL;
+}
+
static struct ip_vs_pe ip_vs_sip_pe =
{
.name = "sip",
@@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe =
.ct_match = ip_vs_sip_ct_match,
.hashkey_raw = ip_vs_sip_hashkey_raw,
.show_pe_data = ip_vs_sip_show_pe_data,
+ .conn_out = ip_vs_sip_conn_out,
};
static int __init ip_vs_sip_init(void)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index dc196a0f501d..01d3d894de46 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -932,17 +932,14 @@ error:
static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
- if (encaps_af == AF_INET) {
- if (orig_af == AF_INET)
- return SKB_GSO_IPIP;
-
- return SKB_GSO_SIT;
+ switch (encaps_af) {
+ case AF_INET:
+ return SKB_GSO_IPXIP4;
+ case AF_INET6:
+ return SKB_GSO_IPXIP6;
+ default:
+ return 0;
}
-
- /* GSO: we need to provide proper SKB_GSO_ value for IPv6:
- * SKB_GSO_SIT/IPV6
- */
- return 0;
}
/*
@@ -1013,8 +1010,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af));
- if (IS_ERR(skb))
+ if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
goto tx_error;
skb->transport_header = skb->network_header;
@@ -1105,8 +1101,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af));
- if (IS_ERR(skb))
+ if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
goto tx_error;
skb->transport_header = skb->network_header;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 895d11dced3c..db2312eeb2a4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -12,6 +12,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
@@ -52,6 +54,7 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <net/netns/hash.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -66,6 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+
+static __read_mostly struct kmem_cache *nf_conntrack_cachep;
+static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly seqcount_t nf_conntrack_generation;
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
@@ -105,7 +114,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
spin_lock_nested(&nf_conntrack_locks[h1],
SINGLE_DEPTH_NESTING);
}
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
nf_conntrack_double_unlock(h1, h2);
return true;
}
@@ -139,43 +148,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-unsigned int nf_conntrack_hash_rnd __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ const struct net *net)
{
unsigned int n;
+ u32 seed;
+
+ get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
+ seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
+ return jhash2((u32 *)tuple, n, seed ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
}
-static u32 __hash_bucket(u32 hash, unsigned int size)
-{
- return reciprocal_scale(hash, size);
-}
-
-static u32 hash_bucket(u32 hash, const struct net *net)
+static u32 scale_hash(u32 hash)
{
- return __hash_bucket(hash, net->ct.htable_size);
+ return reciprocal_scale(hash, nf_conntrack_htable_size);
}
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- unsigned int size)
+static u32 __hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int size)
{
- return __hash_bucket(hash_conntrack_raw(tuple), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}
-static inline u_int32_t hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, net->ct.htable_size);
+ return scale_hash(hash_conntrack_raw(tuple, net));
}
bool
@@ -356,7 +365,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
}
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
- if (l4proto && l4proto->destroy)
+ if (l4proto->destroy)
l4proto->destroy(ct);
rcu_read_unlock();
@@ -391,7 +400,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -443,7 +452,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -452,7 +462,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
*/
return nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
- nf_ct_is_confirmed(ct);
+ nf_ct_is_confirmed(ct) &&
+ net_eq(net, nf_ct_net(ct));
}
/*
@@ -465,21 +476,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int bucket = hash_bucket(hash, net);
+ unsigned int bucket, sequence;
- /* Disable BHs the entire time since we normally need to disable them
- * at least once for the stats anyway.
- */
- local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
- if (nf_ct_key_equal(h, tuple, zone)) {
- NF_CT_STAT_INC(net, found);
- local_bh_enable();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ bucket = scale_hash(hash);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+ if (nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
return h;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
/*
* if the nulls value we got at the end of this lookup is
@@ -487,10 +500,9 @@ begin:
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(n) != bucket) {
- NF_CT_STAT_INC(net, search_restart);
+ NF_CT_STAT_INC_ATOMIC(net, search_restart);
goto begin;
}
- local_bh_enable();
return NULL;
}
@@ -512,7 +524,7 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
nf_ct_put(ct);
goto begin;
}
@@ -528,7 +540,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple));
+ hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -536,12 +548,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
unsigned int hash,
unsigned int reply_hash)
{
- struct net *net = nf_ct_net(ct);
-
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &net->ct.hash[hash]);
+ &nf_conntrack_hash[hash]);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
- &net->ct.hash[reply_hash]);
+ &nf_conntrack_hash[reply_hash]);
}
int
@@ -558,7 +568,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -566,17 +576,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
add_timer(&ct->timeout);
@@ -597,6 +604,62 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
+static inline void nf_ct_acct_update(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int len)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+}
+
+static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ const struct nf_conn *loser_ct)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(loser_ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+ unsigned int bytes;
+
+ /* u32 should be fine since we must have seen one packet. */
+ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
+ nf_ct_acct_update(ct, ctinfo, bytes);
+ }
+}
+
+/* Resolve race on insertion if this protocol allows this. */
+static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct nf_conntrack_l4proto *l4proto;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->allow_clash &&
+ !nf_ct_is_dying(ct) &&
+ atomic_inc_not_zero(&ct->ct_general.use)) {
+ nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
+ nf_conntrack_put(skb->nfct);
+ /* Assign conntrack already in hashes to this skbuff. Don't
+ * modify skb->nfctinfo to ensure consistent stateful filtering.
+ */
+ skb->nfct = &ct->ct_general;
+ return NF_ACCEPT;
+ }
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
@@ -611,6 +674,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
+ int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
@@ -626,10 +690,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
- hash = hash_bucket(hash, net);
+ hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -653,23 +717,22 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- if (unlikely(nf_ct_is_dying(ct)))
- goto out;
+ if (unlikely(nf_ct_is_dying(ct))) {
+ nf_ct_add_to_dying_list(ct);
+ goto dying;
+ }
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
/* Timer relative to confirmation time, not original
@@ -708,10 +771,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
out:
nf_ct_add_to_dying_list(ct);
+ ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+dying:
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return NF_DROP;
+ return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -724,29 +789,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
struct net *net = nf_ct_net(ignored_conntrack);
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
+ unsigned int hash, sequence;
struct hlist_nulls_node *n;
struct nf_conn *ct;
- unsigned int hash;
zone = nf_ct_zone(ignored_conntrack);
- hash = hash_conntrack(net, tuple);
- /* Disable BHs the entire time since we need to disable them at
- * least once for the stats anyway.
- */
- rcu_read_lock_bh();
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ rcu_read_lock();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ hash = hash_conntrack(net, tuple);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
- nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
- NF_CT_STAT_INC(net, found);
- rcu_read_unlock_bh();
+ nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
+ rcu_read_unlock();
return 1;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return 0;
}
@@ -760,71 +827,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
{
/* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
- struct nf_conn *ct = NULL, *tmp;
+ struct nf_conn *tmp;
struct hlist_nulls_node *n;
- unsigned int i = 0, cnt = 0;
- int dropped = 0;
- unsigned int hash, sequence;
+ unsigned int i, hash, sequence;
+ struct nf_conn *ct = NULL;
spinlock_t *lockp;
+ bool ret = false;
+
+ i = 0;
local_bh_disable();
restart:
- sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_bucket(_hash, net);
- for (; i < net->ct.htable_size; i++) {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ for (; i < NF_CT_EVICTION_RANGE; i++) {
+ hash = scale_hash(_hash++);
lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
spin_unlock(lockp);
goto restart;
}
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
- hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+ hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
- !nf_ct_is_dying(tmp) &&
- atomic_inc_not_zero(&tmp->ct_general.use)) {
+
+ if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
+ !net_eq(nf_ct_net(tmp), net) ||
+ nf_ct_is_dying(tmp))
+ continue;
+
+ if (atomic_inc_not_zero(&tmp->ct_general.use)) {
ct = tmp;
break;
}
- cnt++;
}
- hash = (hash + 1) % net->ct.htable_size;
spin_unlock(lockp);
-
- if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ if (ct)
break;
-
}
+
local_bh_enable();
if (!ct)
- return dropped;
+ return false;
- if (del_timer(&ct->timeout)) {
+ /* kill only if in same netns -- might have moved due to
+ * SLAB_DESTROY_BY_RCU rules
+ */
+ if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) {
if (nf_ct_delete(ct, 0, 0)) {
- dropped = 1;
NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ ret = true;
}
}
- nf_ct_put(ct);
- return dropped;
-}
-
-void init_nf_conntrack_hash_rnd(void)
-{
- unsigned int rand;
- /*
- * Why not initialize nf_conntrack_rnd in a "init()" function ?
- * Because there isn't enough entropy when system initializing,
- * and we initialize it as late as possible.
- */
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+ nf_ct_put(ct);
+ return ret;
}
static struct nf_conn *
@@ -836,12 +895,6 @@ __nf_conntrack_alloc(struct net *net,
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- /* recompute the hash as nf_conntrack_hash_rnd is initialized */
- hash = hash_conntrack_raw(orig);
- }
-
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);
@@ -858,7 +911,7 @@ __nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
goto out;
@@ -885,7 +938,7 @@ __nf_conntrack_alloc(struct net *net,
atomic_set(&ct->ct_general.use, 0);
return ct;
out_free:
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
@@ -912,7 +965,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
nf_ct_ext_free(ct);
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
smp_mb__before_atomic();
atomic_dec(&net->ct.count);
}
@@ -966,7 +1019,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
- pr_debug("init conntrack: can't track with proto module\n");
+ pr_debug("can't track with proto module\n");
return NULL;
}
@@ -988,7 +1041,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
spin_lock(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
- pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ pr_debug("expectation arrives ct=%p exp=%p\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
@@ -1053,13 +1106,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
- pr_debug("resolve_normal_ct: Can't get tuple\n");
+ pr_debug("Can't get tuple\n");
return NULL;
}
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple);
+ hash = hash_conntrack_raw(&tuple, net);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1079,14 +1132,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ pr_debug("normal packet for %p\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: related packet for %p\n",
- ct);
+ pr_debug("related packet for %p\n", ct);
*ctinfo = IP_CT_RELATED;
} else {
- pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ pr_debug("new packet for %p\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
@@ -1269,17 +1321,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
}
acct:
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1288,18 +1331,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
const struct sk_buff *skb,
int do_acct)
{
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len - skb_network_offset(skb),
- &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
if (del_timer(&ct->timeout)) {
ct->timeout.function((unsigned long)ct);
@@ -1395,16 +1428,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
int cpu;
spinlock_t *lockp;
- for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (*bucket < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
+ if (net_eq(nf_ct_net(ct), net) &&
+ iter(ct, data))
goto found;
}
}
@@ -1442,6 +1476,9 @@ void nf_ct_iterate_cleanup(struct net *net,
might_sleep();
+ if (atomic_read(&net->ct.count) == 0)
+ return;
+
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
@@ -1493,6 +1530,8 @@ void nf_conntrack_cleanup_end(void)
while (untrack_refs() > 0)
schedule();
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+
#ifdef CONFIG_NF_CONNTRACK_ZONES
nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
@@ -1543,15 +1582,12 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
nf_conntrack_proto_pernet_fini(net);
nf_conntrack_helper_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_tstamp_pernet_fini(net);
nf_conntrack_acct_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
free_percpu(net->ct.stat);
free_percpu(net->ct.pcpu_lists);
}
@@ -1606,7 +1642,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
local_bh_disable();
nf_conntrack_all_lock();
- write_seqcount_begin(&init_net.ct.generation);
+ write_seqcount_begin(&nf_conntrack_generation);
/* Lookups in the old hash might happen in parallel, which means we
* might get false negatives during connection lookup. New connections
@@ -1614,26 +1650,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
* though since that required taking the locks.
*/
- for (i = 0; i < init_net.ct.htable_size; i++) {
- while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
- h = hlist_nulls_entry(init_net.ct.hash[i].first,
- struct nf_conntrack_tuple_hash, hnnode);
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
+ while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ h = hlist_nulls_entry(nf_conntrack_hash[i].first,
+ struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
- bucket = __hash_conntrack(&h->tuple, hashsize);
+ bucket = __hash_conntrack(nf_ct_net(ct),
+ &h->tuple, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = init_net.ct.htable_size;
- old_hash = init_net.ct.hash;
+ old_size = nf_conntrack_htable_size;
+ old_hash = nf_conntrack_hash;
- init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash = hash;
+ nf_conntrack_hash = hash;
+ nf_conntrack_htable_size = hashsize;
- write_seqcount_end(&init_net.ct.generation);
+ write_seqcount_end(&nf_conntrack_generation);
nf_conntrack_all_unlock();
local_bh_enable();
+ synchronize_net();
nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
@@ -1654,7 +1692,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
int nf_conntrack_init_start(void)
{
int max_factor = 8;
- int i, ret, cpu;
+ int ret = -ENOMEM;
+ int i, cpu;
+
+ seqcount_init(&nf_conntrack_generation);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
@@ -1681,8 +1722,19 @@ int nf_conntrack_init_start(void)
* entries. */
max_factor = 4;
}
+
+ nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+ if (!nf_conntrack_hash)
+ return -ENOMEM;
+
nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!nf_conntrack_cachep)
+ goto err_cachep;
+
printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
@@ -1759,6 +1811,9 @@ err_tstamp:
err_acct:
nf_conntrack_expect_fini();
err_expect:
+ kmem_cache_destroy(nf_conntrack_cachep);
+err_cachep:
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
return ret;
}
@@ -1782,7 +1837,6 @@ int nf_conntrack_init_net(struct net *net)
int cpu;
atomic_set(&net->ct.count, 0);
- seqcount_init(&net->ct.generation);
net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
if (!net->ct.pcpu_lists)
@@ -1800,24 +1854,6 @@ int nf_conntrack_init_net(struct net *net)
if (!net->ct.stat)
goto err_pcpu_lists;
- net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
- if (!net->ct.slabname)
- goto err_slabname;
-
- net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
- sizeof(struct nf_conn), 0,
- SLAB_DESTROY_BY_RCU, NULL);
- if (!net->ct.nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- goto err_cache;
- }
-
- net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
- if (!net->ct.hash) {
- printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
- goto err_hash;
- }
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
goto err_expect;
@@ -1849,12 +1885,6 @@ err_tstamp:
err_acct:
nf_conntrack_expect_pernet_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
-err_hash:
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
-err_cache:
- kfree(net->ct.slabname);
-err_slabname:
free_percpu(net->ct.stat);
err_pcpu_lists:
free_percpu(net->ct.pcpu_lists);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 4e78c57b818f..d28011b42845 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -113,6 +113,60 @@ static void ecache_work(struct work_struct *work)
schedule_delayed_work(&ctnet->ecache_dwork, delay);
}
+int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
+ u32 portid, int report)
+{
+ int ret = 0;
+ struct net *net = nf_ct_net(ct);
+ struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
+
+ rcu_read_lock();
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (!notify)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(ct);
+ if (!e)
+ goto out_unlock;
+
+ if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
+ struct nf_ct_event item = {
+ .ct = ct,
+ .portid = e->portid ? e->portid : portid,
+ .report = report
+ };
+ /* This is a resent of a destroy event? If so, skip missed */
+ unsigned long missed = e->portid ? 0 : e->missed;
+
+ if (!((eventmask | missed) & e->ctmask))
+ goto out_unlock;
+
+ ret = notify->fcn(eventmask | missed, &item);
+ if (unlikely(ret < 0 || missed)) {
+ spin_lock_bh(&ct->lock);
+ if (ret < 0) {
+ /* This is a destroy event that has been
+ * triggered by a process, we store the PORTID
+ * to include it in the retransmission.
+ */
+ if (eventmask & (1 << IPCT_DESTROY) &&
+ e->portid == 0 && portid != 0)
+ e->portid = portid;
+ else
+ e->missed |= eventmask;
+ } else {
+ e->missed &= ~missed;
+ }
+ spin_unlock_bh(&ct->lock);
+ }
+ }
+out_unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
+
/* deliver cached events and clear cache entry - must be called with locally
* disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
@@ -167,6 +221,36 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
+ struct nf_conntrack_expect *exp,
+ u32 portid, int report)
+
+{
+ struct net *net = nf_ct_exp_net(exp);
+ struct nf_exp_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
+
+ rcu_read_lock();
+ notify = rcu_dereference(net->ct.nf_expect_event_cb);
+ if (!notify)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(exp->master);
+ if (!e)
+ goto out_unlock;
+
+ if (e->expmask & (1 << event)) {
+ struct nf_exp_event item = {
+ .exp = exp,
+ .portid = portid,
+ .report = report
+ };
+ notify->fcn(1 << event, &item);
+ }
+out_unlock:
+ rcu_read_unlock();
+}
+
int nf_conntrack_register_notifier(struct net *net,
struct nf_ct_event_notifier *new)
{
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 278927ab0948..9e3693128313 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -24,6 +24,7 @@
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
+#include <net/netns/hash.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -35,9 +36,13 @@
unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+struct hlist_head *nf_ct_expect_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
+
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+static unsigned int nf_ct_expect_hashrnd __read_mostly;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
nf_ct_expect_put(exp);
}
-static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- unsigned int hash;
+ unsigned int hash, seed;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- }
+ get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
+
+ seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
(((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
+ (__force __u16)tuple->dst.u.all) ^ seed);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
+static bool
+nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_expect *i,
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
+{
+ return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ net_eq(net, nf_ct_net(i->master)) &&
+ nf_ct_zone_equal_any(i->master, zone);
+}
+
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
@@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net,
if (!net->ct.expect_count)
return NULL;
- h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
- if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone_equal_any(i->master, zone))
+ h = nf_ct_expect_dst_hash(net, tuple);
+ hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) {
+ if (nf_ct_exp_equal(tuple, i, zone, net))
return i;
}
return NULL;
@@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net,
if (!net->ct.expect_count)
return NULL;
- h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
+ h = nf_ct_expect_dst_hash(net, tuple);
+ hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
- nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone_equal_any(i->master, zone)) {
+ nf_ct_exp_equal(tuple, i, zone, net)) {
exp = i;
break;
}
@@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
}
return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
+ net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
@@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
return a->master == b->master && a->class == b->class &&
nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
@@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
- unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
+ unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple);
/* two references : one for hash insert, one for the timer */
atomic_add(2, &exp->use);
@@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
hlist_add_head(&exp->lnode, &master_help->expectations);
master_help->expecting[exp->class]++;
- hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+ hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
@@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
ret = -ESHUTDOWN;
goto out;
}
- h = nf_ct_expect_dst_hash(&expect->tuple);
- hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
+ h = nf_ct_expect_dst_hash(net, &expect->tuple);
+ hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
if (del_timer(&i->timeout)) {
nf_ct_unlink_expect(i);
@@ -468,12 +484,11 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
@@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
@@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
- int err = -ENOMEM;
-
net->ct.expect_count = 0;
- net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
- if (net->ct.expect_hash == NULL)
- goto err1;
-
- err = exp_proc_init(net);
- if (err < 0)
- goto err2;
-
- return 0;
-err2:
- nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
-err1:
- return err;
+ return exp_proc_init(net);
}
void nf_conntrack_expect_pernet_fini(struct net *net)
{
exp_proc_remove(net);
- nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}
int nf_conntrack_expect_init(void)
@@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void)
0, 0, NULL);
if (!nf_ct_expect_cachep)
return -ENOMEM;
+
+ nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
+ if (!nf_ct_expect_hash) {
+ kmem_cache_destroy(nf_ct_expect_cachep);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void)
{
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
+ nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 48de9be45a9a..196cb39649e1 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = true;
+static bool nf_ct_auto_assign_helper __read_mostly = false;
module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 1)");
+ "Enable automatic conntrack helper assignment (default 0)");
#ifdef CONFIG_SYSCTL
static struct ctl_table helper_sysctl_table[] = {
@@ -399,7 +399,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i], hnode) {
+ &nf_ct_expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
if ((rcu_dereference_protected(
help->helper,
@@ -423,10 +423,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_unlock_bh(&pcpu->lock);
}
local_bh_disable();
- for (i = 0; i < net->ct.htable_size; i++) {
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ if (i < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
unhelp(h, me);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 3ce5c314ea4b..252e6a7cd2f1 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -16,28 +16,11 @@
static spinlock_t nf_connlabels_lock;
-static unsigned int label_bits(const struct nf_conn_labels *l)
-{
- unsigned int longs = l->words;
- return longs * BITS_PER_LONG;
-}
-
-bool nf_connlabel_match(const struct nf_conn *ct, u16 bit)
-{
- struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
- if (!labels)
- return false;
-
- return bit < label_bits(labels) && test_bit(bit, labels->bits);
-}
-EXPORT_SYMBOL_GPL(nf_connlabel_match);
-
int nf_connlabel_set(struct nf_conn *ct, u16 bit)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
- if (!labels || bit >= label_bits(labels))
+ if (!labels || BIT_WORD(bit) >= labels->words)
return -ENOSPC;
if (test_bit(bit, labels->bits))
@@ -50,14 +33,18 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
}
EXPORT_SYMBOL_GPL(nf_connlabel_set);
-static void replace_u32(u32 *address, u32 mask, u32 new)
+static int replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
do {
old = *address;
tmp = (old & mask) ^ new;
+ if (old == tmp)
+ return 0;
} while (cmpxchg(address, old, tmp) != old);
+
+ return 1;
}
int nf_connlabels_replace(struct nf_conn *ct,
@@ -66,6 +53,7 @@ int nf_connlabels_replace(struct nf_conn *ct,
{
struct nf_conn_labels *labels;
unsigned int size, i;
+ int changed = 0;
u32 *dst;
labels = nf_ct_labels_find(ct);
@@ -77,29 +65,27 @@ int nf_connlabels_replace(struct nf_conn *ct,
words32 = size / sizeof(u32);
dst = (u32 *) labels->bits;
- if (words32) {
- for (i = 0; i < words32; i++)
- replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
- }
+ for (i = 0; i < words32; i++)
+ changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
size /= sizeof(u32);
for (i = words32; i < size; i++) /* pad */
replace_u32(&dst[i], 0, 0);
- nf_conntrack_event_cache(IPCT_LABEL, ct);
+ if (changed)
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_replace);
-int nf_connlabels_get(struct net *net, unsigned int n_bits)
+int nf_connlabels_get(struct net *net, unsigned int bits)
{
size_t words;
- if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+ words = BIT_WORD(bits) + 1;
+ if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long))
return -ERANGE;
- words = BITS_TO_LONGS(n_bits);
-
spin_lock(&nf_connlabels_lock);
net->ct.labels_used++;
if (words > net->ct.label_words)
@@ -128,6 +114,8 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
int nf_conntrack_labels_init(void)
{
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 355e8552fd5b..a18d1ceabad5 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -58,10 +58,9 @@ MODULE_LICENSE("GPL");
static char __initdata version[] = "0.93";
-static inline int
-ctnetlink_dump_tuples_proto(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_l4proto *l4proto)
+static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
struct nlattr *nest_parms;
@@ -83,10 +82,9 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_tuples_ip(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_l3proto *l3proto)
+static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto)
{
int ret = 0;
struct nlattr *nest_parms;
@@ -106,9 +104,8 @@ nla_put_failure:
return -1;
}
-static int
-ctnetlink_dump_tuples(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
+static int ctnetlink_dump_tuples(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
{
int ret;
struct nf_conntrack_l3proto *l3proto;
@@ -127,9 +124,8 @@ ctnetlink_dump_tuples(struct sk_buff *skb,
return ret;
}
-static inline int
-ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
- const struct nf_conntrack_zone *zone, int dir)
+static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
+ const struct nf_conntrack_zone *zone, int dir)
{
if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir)
return 0;
@@ -141,8 +137,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
goto nla_put_failure;
@@ -152,8 +147,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
{
long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
@@ -168,8 +162,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
{
struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -193,8 +186,8 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
+ const struct nf_conn *ct)
{
struct nlattr *nest_helper;
const struct nf_conn_help *help = nfct_help(ct);
@@ -245,8 +238,10 @@ dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
if (!nest_count)
goto nla_put_failure;
- if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) ||
- nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)))
+ if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts),
+ CTA_COUNTERS_PAD) ||
+ nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes),
+ CTA_COUNTERS_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest_count);
@@ -287,9 +282,11 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
if (!nest_count)
goto nla_put_failure;
- if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) ||
+ if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start),
+ CTA_TIMESTAMP_PAD) ||
(tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP,
- cpu_to_be64(tstamp->stop))))
+ cpu_to_be64(tstamp->stop),
+ CTA_TIMESTAMP_PAD)))
goto nla_put_failure;
nla_nest_end(skb, nest_count);
@@ -300,8 +297,7 @@ nla_put_failure:
}
#ifdef CONFIG_NF_CONNTRACK_MARK
-static inline int
-ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
goto nla_put_failure;
@@ -315,8 +311,7 @@ nla_put_failure:
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static inline int
-ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
int len, ret;
@@ -345,7 +340,7 @@ nla_put_failure:
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
-static int ctnetlink_label_size(const struct nf_conn *ct)
+static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -380,8 +375,7 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
-static inline int
-ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_parms;
@@ -426,8 +420,8 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb,
+ const struct nf_conn *ct)
{
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
struct nf_ct_seqadj *seq;
@@ -446,8 +440,7 @@ ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
return 0;
}
-static inline int
-ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
goto nla_put_failure;
@@ -457,8 +450,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
goto nla_put_failure;
@@ -538,8 +530,7 @@ nla_put_failure:
return -1;
}
-static inline size_t
-ctnetlink_proto_size(const struct nf_conn *ct)
+static inline size_t ctnetlink_proto_size(const struct nf_conn *ct)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -556,19 +547,17 @@ ctnetlink_proto_size(const struct nf_conn *ct)
return len;
}
-static inline size_t
-ctnetlink_acct_size(const struct nf_conn *ct)
+static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
return 0;
return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */
- + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
- + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
+ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
+ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
;
}
-static inline int
-ctnetlink_secctx_size(const struct nf_conn *ct)
+static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
int len, ret;
@@ -584,20 +573,19 @@ ctnetlink_secctx_size(const struct nf_conn *ct)
#endif
}
-static inline size_t
-ctnetlink_timestamp_size(const struct nf_conn *ct)
+static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
return 0;
- return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+ return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t));
#else
return 0;
#endif
}
-static inline size_t
-ctnetlink_nlmsg_size(const struct nf_conn *ct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
return NLMSG_ALIGN(sizeof(struct nfgenmsg))
+ 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
@@ -628,7 +616,6 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
;
}
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
@@ -837,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conn *)cb->args[1];
local_bh_disable();
- for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+ for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (cb->args[0] >= net->ct.htable_size) {
+ if (cb->args[0] >= nf_conntrack_htable_size) {
spin_unlock(lockp);
goto out;
}
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
- hnnode) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
+ hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
@@ -891,8 +881,8 @@ out:
return skb->len;
}
-static inline int
-ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
+static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
+ struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_IP_MAX+1];
struct nf_conntrack_l3proto *l3proto;
@@ -921,9 +911,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
[CTA_PROTO_NUM] = { .type = NLA_U8 },
};
-static inline int
-ctnetlink_parse_tuple_proto(struct nlattr *attr,
- struct nf_conntrack_tuple *tuple)
+static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
+ struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_PROTO_MAX+1];
struct nf_conntrack_l4proto *l4proto;
@@ -1050,9 +1039,8 @@ static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
.len = NF_CT_HELPER_NAME_LEN - 1 },
};
-static inline int
-ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
- struct nlattr **helpinfo)
+static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
+ struct nlattr **helpinfo)
{
int err;
struct nlattr *tb[CTA_HELP_MAX+1];
@@ -1463,8 +1451,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
#endif
}
-static inline int
-ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
+static int ctnetlink_change_helper(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
struct nf_conntrack_helper *helper;
struct nf_conn_help *help = nfct_help(ct);
@@ -1524,8 +1512,8 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
return -EOPNOTSUPP;
}
-static inline int
-ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[])
+static int ctnetlink_change_timeout(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
@@ -1544,8 +1532,8 @@ static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
};
-static inline int
-ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[])
+static int ctnetlink_change_protoinfo(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
const struct nlattr *attr = cda[CTA_PROTOINFO];
struct nlattr *tb[CTA_PROTOINFO_MAX+1];
@@ -1571,8 +1559,8 @@ static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
[CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 },
};
-static inline int
-change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
+static int change_seq_adj(struct nf_ct_seqadj *seq,
+ const struct nlattr * const attr)
{
int err;
struct nlattr *cda[CTA_SEQADJ_MAX+1];
@@ -2405,10 +2393,9 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = {
* EXPECT
***********************************************************************/
-static inline int
-ctnetlink_exp_dump_tuple(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- enum ctattr_expect type)
+static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ enum ctattr_expect type)
{
struct nlattr *nest_parms;
@@ -2425,10 +2412,9 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_exp_dump_mask(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple_mask *mask)
+static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple_mask *mask)
{
int ret;
struct nf_conntrack_l3proto *l3proto;
@@ -2646,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
- hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
+ hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]],
hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
+
+ if (!net_eq(nf_ct_net(exp->master), net))
+ continue;
+
if (cb->args[1]) {
if (exp != last)
continue;
@@ -2900,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i],
+ &nf_ct_expect_hash[i],
hnode) {
+
+ if (!net_eq(nf_ct_exp_net(exp), net))
+ continue;
+
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
@@ -2918,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i],
+ &nf_ct_expect_hash[i],
hnode) {
+
+ if (!net_eq(nf_ct_exp_net(exp), net))
+ continue;
+
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
NETLINK_CB(skb).portid,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index fce1b1cca32d..399a38fd685a 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -645,7 +645,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
- cpu_to_be64(ct->proto.dccp.handshake_seq)))
+ cpu_to_be64(ct->proto.dccp.handshake_seq),
+ CTA_PROTOINFO_DCCP_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
spin_unlock_bh(&ct->lock);
@@ -660,6 +661,7 @@ static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
[CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
[CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
+ [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC },
};
static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 9578a7c371ef..1d7ab960a9e6 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -191,13 +191,7 @@ static void sctp_print_tuple(struct seq_file *s,
/* Print out the private part of the conntrack. */
static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
- enum sctp_conntrack state;
-
- spin_lock_bh(&ct->lock);
- state = ct->proto.sctp.state;
- spin_unlock_bh(&ct->lock);
-
- seq_printf(s, "%s ", sctp_conntrack_names[state]);
+ seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]);
}
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7cc1d9c22a9f..70c8381641a7 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -313,13 +313,7 @@ static void tcp_print_tuple(struct seq_file *s,
/* Print out the private part of the conntrack. */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
- enum tcp_conntrack state;
-
- spin_lock_bh(&ct->lock);
- state = ct->proto.tcp.state;
- spin_unlock_bh(&ct->lock);
-
- seq_printf(s, "%s ", tcp_conntrack_names[state]);
+ seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 478f92f834b6..4fd040575ffe 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
@@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 1ac8ee13a873..9d692f5adb94 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
@@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2933db30098a..c026c472ea80 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -54,14 +54,13 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < net->ct.htable_size;
+ st->bucket < nf_conntrack_htable_size;
st->bucket++) {
- n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= net->ct.htable_size)
+ if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
hlist_nulls_first_rcu(
- &net->ct.hash[st->bucket]));
+ &nf_conntrack_hash[st->bucket]));
}
return head;
}
@@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
{
.procname = "nf_conntrack_buckets",
- .data = &init_net.ct.htable_size,
+ .data = &nf_conntrack_htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -510,7 +508,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
goto out_kmemdup;
table[1].data = &net->ct.count;
- table[2].data = &net->ct.htable_size;
table[3].data = &net->ct.sysctl_checksum;
table[4].data = &net->ct.sysctl_log_invalid;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 06a9f45771ab..6877a396f8fc 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
-hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
+ get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_conntrack_hash_rnd);
+ tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- return reciprocal_scale(hash, net->ct.nat_htable_size);
+ return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
@@ -196,9 +201,10 @@ find_appropriate_src(struct net *net,
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
- hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
+ hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) {
ct = nat->ct;
if (same_src(ct, tuple) &&
+ net_eq(net, nf_ct_net(ct)) &&
nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
@@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct,
nat = nfct_nat(ct);
nat->ct = ct;
hlist_add_head_rcu(&nat->bysource,
- &net->ct.nat_bysource[srchash]);
+ &nf_nat_bysource[srchash]);
spin_unlock_bh(&nf_nat_lock);
}
@@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static int __net_init nf_nat_net_init(struct net *net)
-{
- /* Leave them the same for the moment. */
- net->ct.nat_htable_size = net->ct.htable_size;
- net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
- if (!net->ct.nat_bysource)
- return -ENOMEM;
- return 0;
-}
-
static void __net_exit nf_nat_net_exit(struct net *net)
{
struct nf_nat_proto_clean clean = {};
nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
- synchronize_rcu();
- nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
- .init = nf_nat_net_init,
.exit = nf_nat_net_exit,
};
@@ -852,8 +845,16 @@ static int __init nf_nat_init(void)
{
int ret;
+ /* Leave them the same for the moment. */
+ nf_nat_htable_size = nf_conntrack_htable_size;
+
+ nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_bysource)
+ return -ENOMEM;
+
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -877,6 +878,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6947e255cdd8..7b7aa871a174 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -944,8 +944,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) ||
- nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)))
+ if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts),
+ NFTA_COUNTER_PAD) ||
+ nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
+ NFTA_COUNTER_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -975,7 +977,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)))
+ if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
+ NFTA_CHAIN_PAD))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
goto nla_put_failure;
@@ -1803,13 +1806,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+ if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle),
+ NFTA_RULE_PAD))
goto nla_put_failure;
if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
prule = list_entry(rule->list.prev, struct nft_rule, list);
if (nla_put_be64(skb, NFTA_RULE_POSITION,
- cpu_to_be64(prule->handle)))
+ cpu_to_be64(prule->handle),
+ NFTA_RULE_PAD))
goto nla_put_failure;
}
@@ -2312,7 +2317,7 @@ nft_select_set_ops(const struct nlattr * const nla[],
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_TABLE] = { .type = NLA_STRING },
[NFTA_SET_NAME] = { .type = NLA_STRING,
- .len = IFNAMSIZ - 1 },
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
@@ -2396,7 +2401,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
unsigned long *inuse;
unsigned int n = 0, min = 0;
- p = strnchr(name, IFNAMSIZ, '%');
+ p = strnchr(name, NFT_SET_MAXNAMELEN, '%');
if (p != NULL) {
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
@@ -2473,7 +2478,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
}
if (set->timeout &&
- nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout)))
+ nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout),
+ NFTA_SET_PAD))
goto nla_put_failure;
if (set->gc_int &&
nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int)))
@@ -2692,7 +2698,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
- char name[IFNAMSIZ];
+ char name[NFT_SET_MAXNAMELEN];
unsigned int size;
bool create;
u64 timeout;
@@ -3078,7 +3084,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- cpu_to_be64(*nft_set_ext_timeout(ext))))
+ cpu_to_be64(*nft_set_ext_timeout(ext)),
+ NFTA_SET_ELEM_PAD))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
@@ -3091,7 +3098,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
expires = 0;
if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- cpu_to_be64(jiffies_to_msecs(expires))))
+ cpu_to_be64(jiffies_to_msecs(expires)),
+ NFTA_SET_ELEM_PAD))
goto nla_put_failure;
}
@@ -3369,6 +3377,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem)
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
+static int nft_setelem_parse_flags(const struct nft_set *set,
+ const struct nlattr *attr, u32 *flags)
+{
+ if (attr == NULL)
+ return 0;
+
+ *flags = ntohl(nla_get_be32(attr));
+ if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ if (!(set->flags & NFT_SET_INTERVAL) &&
+ *flags & NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+
+ return 0;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
@@ -3382,8 +3406,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data data;
enum nft_registers dreg;
struct nft_trans *trans;
+ u32 flags = 0;
u64 timeout;
- u32 flags;
u8 ulen;
int err;
@@ -3397,17 +3421,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_prepare(&tmpl);
- flags = 0;
- if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
- flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
- if (flags & ~NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
- if (!(set->flags & NFT_SET_INTERVAL) &&
- flags & NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
- }
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -3576,9 +3594,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_set_ext_tmpl tmpl;
struct nft_data_desc desc;
struct nft_set_elem elem;
+ struct nft_set_ext *ext;
struct nft_trans *trans;
+ u32 flags = 0;
+ void *priv;
int err;
err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
@@ -3590,6 +3612,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_KEY] == NULL)
goto err1;
+ nft_set_ext_prepare(&tmpl);
+
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+
err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
@@ -3599,24 +3629,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
goto err2;
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
+
+ err = -ENOMEM;
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
+ GFP_KERNEL);
+ if (elem.priv == NULL)
+ goto err2;
+
+ ext = nft_set_elem_ext(set, elem.priv);
+ if (flags)
+ *nft_set_ext_flags(ext) = flags;
+
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (trans == NULL) {
err = -ENOMEM;
- goto err2;
+ goto err3;
}
- elem.priv = set->ops->deactivate(set, &elem);
- if (elem.priv == NULL) {
+ priv = set->ops->deactivate(set, &elem);
+ if (priv == NULL) {
err = -ENOENT;
- goto err3;
+ goto err4;
}
+ kfree(elem.priv);
+ elem.priv = priv;
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
-err3:
+err4:
kfree(trans);
+err3:
+ kfree(elem.priv);
err2:
nft_data_uninit(&elem.key.val, desc.type);
err1:
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index e9e959f65d91..39eb1cc62e91 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -156,7 +156,8 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
return 0;
return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
- cpu_to_be64(info->rule->handle));
+ cpu_to_be64(info->rule->handle),
+ NFTA_TRACE_PAD);
}
void nft_trace_notify(struct nft_traceinfo *info)
@@ -174,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
nla_total_size(NFT_TABLE_MAXNAMELEN) +
nla_total_size(NFT_CHAIN_MAXNAMELEN) +
- nla_total_size(sizeof(__be64)) + /* rule handle */
+ nla_total_size_64bit(sizeof(__be64)) + /* rule handle */
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index dbd0803b1827..1b4de4bd6958 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -162,15 +162,18 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
pkts = atomic64_read(&acct->pkts);
bytes = atomic64_read(&acct->bytes);
}
- if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) ||
- nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) ||
+ if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts),
+ NFACCT_PAD) ||
+ nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes),
+ NFACCT_PAD) ||
nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))))
goto nla_put_failure;
if (acct->flags & NFACCT_F_QUOTA) {
u64 *quota = (u64 *)acct->data;
if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) ||
- nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota)))
+ nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota),
+ NFACCT_PAD))
goto nla_put_failure;
}
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 2671b9deb103..3c84f14326f5 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
int i;
local_bh_disable();
- for (i = 0; i < net->ct.htable_size; i++) {
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ if (i < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
untimeout(h, timeout);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 309ac026aac2..5d36a0926b4a 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -295,6 +295,59 @@ static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
return seclen;
}
+static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
+{
+ struct sk_buff *entskb = entry->skb;
+ u32 nlalen = 0;
+
+ if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
+ return 0;
+
+ if (skb_vlan_tag_present(entskb))
+ nlalen += nla_total_size(nla_total_size(sizeof(__be16)) +
+ nla_total_size(sizeof(__be16)));
+
+ if (entskb->network_header > entskb->mac_header)
+ nlalen += nla_total_size((entskb->network_header -
+ entskb->mac_header));
+
+ return nlalen;
+}
+
+static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
+{
+ struct sk_buff *entskb = entry->skb;
+
+ if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
+ return 0;
+
+ if (skb_vlan_tag_present(entskb)) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFQA_VLAN | NLA_F_NESTED);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) ||
+ nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ }
+
+ if (entskb->mac_header < entskb->network_header) {
+ int len = (int)(entskb->network_header - entskb->mac_header);
+
+ if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb)))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
@@ -334,6 +387,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (entskb->tstamp.tv64)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
+ size += nfqnl_get_bridge_size(entry);
+
if (entry->state.hook <= NF_INET_FORWARD ||
(entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
csum_verify = !skb_csum_unnecessary(entskb);
@@ -497,6 +552,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
}
+ if (nfqnl_put_bridge(entry, skb) < 0)
+ goto nla_put_failure;
+
if (entskb->tstamp.tv64) {
struct nfqnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
@@ -911,12 +969,18 @@ static struct notifier_block nfqnl_rtnl_notifier = {
.notifier_call = nfqnl_rcv_nl_event,
};
+static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = {
+ [NFQA_VLAN_TCI] = { .type = NLA_U16},
+ [NFQA_VLAN_PROTO] = { .type = NLA_U16},
+};
+
static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
[NFQA_PAYLOAD] = { .type = NLA_UNSPEC },
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
+ [NFQA_VLAN] = { .type = NLA_NESTED },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
@@ -1030,6 +1094,40 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
return ct;
}
+static int nfqa_parse_bridge(struct nf_queue_entry *entry,
+ const struct nlattr * const nfqa[])
+{
+ if (nfqa[NFQA_VLAN]) {
+ struct nlattr *tb[NFQA_VLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN],
+ nfqa_vlan_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO])
+ return -EINVAL;
+
+ entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]));
+ entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]);
+ }
+
+ if (nfqa[NFQA_L2HDR]) {
+ int mac_header_len = entry->skb->network_header -
+ entry->skb->mac_header;
+
+ if (mac_header_len != nla_len(nfqa[NFQA_L2HDR]))
+ return -EINVAL;
+ else if (mac_header_len > 0)
+ memcpy(skb_mac_header(entry->skb),
+ nla_data(nfqa[NFQA_L2HDR]),
+ mac_header_len);
+ }
+
+ return 0;
+}
+
static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -1045,6 +1143,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ int err;
queue = instance_lookup(q, queue_num);
if (!queue)
@@ -1071,6 +1170,12 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
}
+ if (entry->state.pf == PF_BRIDGE) {
+ err = nfqa_parse_bridge(entry, nfqa);
+ if (err < 0)
+ return err;
+ }
+
if (nfqa[NFQA_PAYLOAD]) {
u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
int diff = payload_len - entry->skb->len;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index c9743f78f219..77db8358ab14 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -76,8 +76,10 @@ static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
nft_counter_fetch(priv->counter, &total);
- if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) ||
- nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets)))
+ if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
+ NFTA_COUNTER_PAD) ||
+ nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets),
+ NFTA_COUNTER_PAD))
goto nla_put_failure;
return 0;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index d4a4619fcebc..137e308d5b24 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -198,6 +198,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
}
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_replace(ct,
+ &regs->data[priv->sreg],
+ &regs->data[priv->sreg],
+ NF_CT_LABELS_MAX_SIZE / sizeof(u32));
+ break;
+#endif
default:
break;
}
@@ -365,6 +373,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
len = FIELD_SIZEOF(struct nf_conn, mark);
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+ len = NF_CT_LABELS_MAX_SIZE;
+ err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
+ if (err)
+ return err;
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -384,6 +402,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
static void nft_ct_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
+ struct nft_ct *priv = nft_expr_priv(expr);
+
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_put(ctx->net);
+ break;
+#endif
+ default:
+ break;
+ }
+
nft_ct_l3proto_module_put(ctx->afi->family);
}
@@ -484,6 +514,8 @@ static struct nft_expr_type nft_ct_type __read_mostly = {
static int __init nft_ct_module_init(void)
{
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE);
+
return nft_register_expr(&nft_ct_type);
}
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 9dec3bd1b63c..78d4914fb39c 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -227,7 +227,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout)))
+ if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout),
+ NFTA_DYNSET_PAD))
goto nla_put_failure;
if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
goto nla_put_failure;
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3f9d45d3d9b7..6fa016564f90 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -192,7 +192,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int err;
- err = rhashtable_walk_init(&priv->ht, &hti);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
iter->err = err;
if (err)
return;
@@ -248,7 +248,7 @@ static void nft_hash_gc(struct work_struct *work)
priv = container_of(work, struct nft_hash, gc_work.work);
set = nft_set_container_of(priv);
- err = rhashtable_walk_init(&priv->ht, &hti);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
if (err)
goto schedule;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 99d18578afc6..070b98938e02 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -97,8 +97,10 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
u64 rate = limit->rate - limit->burst;
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) ||
- nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) ||
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate),
+ NFTA_LIMIT_PAD) ||
+ nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs),
+ NFTA_LIMIT_PAD) ||
nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index 1c30f41cff5b..f762094af7c1 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -29,6 +29,17 @@ struct nft_rbtree_elem {
struct nft_set_ext ext;
};
+static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
+{
+ return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
+ (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
+}
+
+static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
+ const struct nft_rbtree_elem *interval)
+{
+ return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+}
static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_set_ext **ext)
@@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_rbtree_elem *rbe, *interval = NULL;
const struct rb_node *parent;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ const void *this;
int d;
spin_lock_bh(&nft_rbtree_lock);
@@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
+ this = nft_set_ext_key(&rbe->ext);
+ d = memcmp(this, key, set->klen);
if (d < 0) {
parent = parent->rb_left;
+ /* In case of adjacent ranges, we always see the high
+ * part of the range in first place, before the low one.
+ * So don't update interval if the keys are equal.
+ */
+ if (interval && nft_rbtree_equal(set, this, interval))
+ continue;
interval = rbe;
} else if (d > 0)
parent = parent->rb_right;
@@ -56,9 +75,7 @@ found:
parent = parent->rb_left;
continue;
}
- if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(&rbe->ext) &
- NFT_SET_ELEM_INTERVAL_END)
+ if (nft_rbtree_interval_end(rbe))
goto out;
spin_unlock_bh(&nft_rbtree_lock);
@@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set,
else if (d > 0)
p = &parent->rb_right;
else {
- if (nft_set_elem_active(&rbe->ext, genmask))
- return -EEXIST;
- p = &parent->rb_left;
+ if (nft_set_elem_active(&rbe->ext, genmask)) {
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(new))
+ p = &parent->rb_left;
+ else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(new))
+ p = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
}
}
rb_link_node(&new->node, parent, p);
@@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
{
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
- struct nft_rbtree_elem *rbe;
+ struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int d;
@@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
parent = parent->rb_left;
continue;
}
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(this)) {
+ parent = parent->rb_left;
+ continue;
+ } else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(this)) {
+ parent = parent->rb_right;
+ continue;
+ }
nft_set_elem_change_active(set, &rbe->ext);
return rbe;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 582c9cfd6567..c69c892231d7 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -416,6 +416,47 @@ int xt_check_match(struct xt_mtchk_param *par,
}
EXPORT_SYMBOL_GPL(xt_check_match);
+/** xt_check_entry_match - check that matches end before start of target
+ *
+ * @match: beginning of xt_entry_match
+ * @target: beginning of this rules target (alleged end of matches)
+ * @alignment: alignment requirement of match structures
+ *
+ * Validates that all matches add up to the beginning of the target,
+ * and that each match covers at least the base structure size.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int xt_check_entry_match(const char *match, const char *target,
+ const size_t alignment)
+{
+ const struct xt_entry_match *pos;
+ int length = target - match;
+
+ if (length == 0) /* no matches */
+ return 0;
+
+ pos = (struct xt_entry_match *)match;
+ do {
+ if ((unsigned long)pos % alignment)
+ return -EINVAL;
+
+ if (length < (int)sizeof(struct xt_entry_match))
+ return -EINVAL;
+
+ if (pos->u.match_size < sizeof(struct xt_entry_match))
+ return -EINVAL;
+
+ if (pos->u.match_size > length)
+ return -EINVAL;
+
+ length -= pos->u.match_size;
+ pos = ((void *)((char *)(pos) + (pos)->u.match_size));
+ } while (length > 0);
+
+ return 0;
+}
+
#ifdef CONFIG_COMPAT
int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
@@ -485,13 +526,14 @@ int xt_compat_match_offset(const struct xt_match *match)
}
EXPORT_SYMBOL_GPL(xt_compat_match_offset);
-int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
- unsigned int *size)
+void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+ unsigned int *size)
{
const struct xt_match *match = m->u.kernel.match;
struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
int pad, off = xt_compat_match_offset(match);
u_int16_t msize = cm->u.user.match_size;
+ char name[sizeof(m->u.user.name)];
m = *dstptr;
memcpy(m, cm, sizeof(*cm));
@@ -505,10 +547,12 @@ int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
msize += off;
m->u.user.match_size = msize;
+ strlcpy(name, match->name, sizeof(name));
+ module_put(match->me);
+ strncpy(m->u.user.name, name, sizeof(m->u.user.name));
*size += off;
*dstptr += msize;
- return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
@@ -539,8 +583,125 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+
+/* non-compat version may have padding after verdict */
+struct compat_xt_standard_target {
+ struct compat_xt_entry_target t;
+ compat_uint_t verdict;
+};
+
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
+ unsigned int target_offset,
+ unsigned int next_offset)
+{
+ long size_of_base_struct = elems - (const char *)base;
+ const struct compat_xt_entry_target *t;
+ const char *e = base;
+
+ if (target_offset < size_of_base_struct)
+ return -EINVAL;
+
+ if (target_offset + sizeof(*t) > next_offset)
+ return -EINVAL;
+
+ t = (void *)(e + target_offset);
+ if (t->u.target_size < sizeof(*t))
+ return -EINVAL;
+
+ if (target_offset + t->u.target_size > next_offset)
+ return -EINVAL;
+
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
+ target_offset + sizeof(struct compat_xt_standard_target) != next_offset)
+ return -EINVAL;
+
+ /* compat_xt_entry match has less strict aligment requirements,
+ * otherwise they are identical. In case of padding differences
+ * we need to add compat version of xt_check_entry_match.
+ */
+ BUILD_BUG_ON(sizeof(struct compat_xt_entry_match) != sizeof(struct xt_entry_match));
+
+ return xt_check_entry_match(elems, base + target_offset,
+ __alignof__(struct compat_xt_entry_match));
+}
+EXPORT_SYMBOL(xt_compat_check_entry_offsets);
#endif /* CONFIG_COMPAT */
+/**
+ * xt_check_entry_offsets - validate arp/ip/ip6t_entry
+ *
+ * @base: pointer to arp/ip/ip6t_entry
+ * @elems: pointer to first xt_entry_match, i.e. ip(6)t_entry->elems
+ * @target_offset: the arp/ip/ip6_t->target_offset
+ * @next_offset: the arp/ip/ip6_t->next_offset
+ *
+ * validates that target_offset and next_offset are sane and that all
+ * match sizes (if any) align with the target offset.
+ *
+ * This function does not validate the targets or matches themselves, it
+ * only tests that all the offsets and sizes are correct, that all
+ * match structures are aligned, and that the last structure ends where
+ * the target structure begins.
+ *
+ * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
+ *
+ * The arp/ip/ip6t_entry structure @base must have passed following tests:
+ * - it must point to a valid memory location
+ * - base to base + next_offset must be accessible, i.e. not exceed allocated
+ * length.
+ *
+ * A well-formed entry looks like this:
+ *
+ * ip(6)t_entry match [mtdata] match [mtdata] target [tgdata] ip(6)t_entry
+ * e->elems[]-----' | |
+ * matchsize | |
+ * matchsize | |
+ * | |
+ * target_offset---------------------------------' |
+ * next_offset---------------------------------------------------'
+ *
+ * elems[]: flexible array member at end of ip(6)/arpt_entry struct.
+ * This is where matches (if any) and the target reside.
+ * target_offset: beginning of target.
+ * next_offset: start of the next rule; also: size of this rule.
+ * Since targets have a minimum size, target_offset + minlen <= next_offset.
+ *
+ * Every match stores its size, sum of sizes must not exceed target_offset.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int xt_check_entry_offsets(const void *base,
+ const char *elems,
+ unsigned int target_offset,
+ unsigned int next_offset)
+{
+ long size_of_base_struct = elems - (const char *)base;
+ const struct xt_entry_target *t;
+ const char *e = base;
+
+ /* target start is within the ip/ip6/arpt_entry struct */
+ if (target_offset < size_of_base_struct)
+ return -EINVAL;
+
+ if (target_offset + sizeof(*t) > next_offset)
+ return -EINVAL;
+
+ t = (void *)(e + target_offset);
+ if (t->u.target_size < sizeof(*t))
+ return -EINVAL;
+
+ if (target_offset + t->u.target_size > next_offset)
+ return -EINVAL;
+
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
+ target_offset + sizeof(struct xt_standard_target) != next_offset)
+ return -EINVAL;
+
+ return xt_check_entry_match(elems, base + target_offset,
+ __alignof__(struct xt_entry_match));
+}
+EXPORT_SYMBOL(xt_check_entry_offsets);
+
int xt_check_target(struct xt_tgchk_param *par,
unsigned int size, u_int8_t proto, bool inv_proto)
{
@@ -591,6 +752,80 @@ int xt_check_target(struct xt_tgchk_param *par,
}
EXPORT_SYMBOL_GPL(xt_check_target);
+/**
+ * xt_copy_counters_from_user - copy counters and metadata from userspace
+ *
+ * @user: src pointer to userspace memory
+ * @len: alleged size of userspace memory
+ * @info: where to store the xt_counters_info metadata
+ * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel
+ *
+ * Copies counter meta data from @user and stores it in @info.
+ *
+ * vmallocs memory to hold the counters, then copies the counter data
+ * from @user to the new memory and returns a pointer to it.
+ *
+ * If @compat is true, @info gets converted automatically to the 64bit
+ * representation.
+ *
+ * The metadata associated with the counters is stored in @info.
+ *
+ * Return: returns pointer that caller has to test via IS_ERR().
+ * If IS_ERR is false, caller has to vfree the pointer.
+ */
+void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
+ struct xt_counters_info *info, bool compat)
+{
+ void *mem;
+ u64 size;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ /* structures only differ in size due to alignment */
+ struct compat_xt_counters_info compat_tmp;
+
+ if (len <= sizeof(compat_tmp))
+ return ERR_PTR(-EINVAL);
+
+ len -= sizeof(compat_tmp);
+ if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
+ return ERR_PTR(-EFAULT);
+
+ strlcpy(info->name, compat_tmp.name, sizeof(info->name));
+ info->num_counters = compat_tmp.num_counters;
+ user += sizeof(compat_tmp);
+ } else
+#endif
+ {
+ if (len <= sizeof(*info))
+ return ERR_PTR(-EINVAL);
+
+ len -= sizeof(*info);
+ if (copy_from_user(info, user, sizeof(*info)) != 0)
+ return ERR_PTR(-EFAULT);
+
+ info->name[sizeof(info->name) - 1] = '\0';
+ user += sizeof(*info);
+ }
+
+ size = sizeof(struct xt_counters);
+ size *= info->num_counters;
+
+ if (size != (u64)len)
+ return ERR_PTR(-EINVAL);
+
+ mem = vmalloc(len);
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(mem, user, len) == 0)
+ return mem;
+
+ vfree(mem);
+ return ERR_PTR(-EFAULT);
+}
+EXPORT_SYMBOL_GPL(xt_copy_counters_from_user);
+
#ifdef CONFIG_COMPAT
int xt_compat_target_offset(const struct xt_target *target)
{
@@ -606,6 +841,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
int pad, off = xt_compat_target_offset(target);
u_int16_t tsize = ct->u.user.target_size;
+ char name[sizeof(t->u.user.name)];
t = *dstptr;
memcpy(t, ct, sizeof(*ct));
@@ -619,6 +855,9 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
tsize += off;
t->u.user.target_size = tsize;
+ strlcpy(name, target->name, sizeof(name));
+ module_put(target->me);
+ strncpy(t->u.user.name, name, sizeof(t->u.user.name));
*size += off;
*dstptr += tsize;
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index bb9cbeb18868..a79af255561a 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -18,6 +18,16 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
MODULE_ALIAS("ipt_connlabel");
MODULE_ALIAS("ip6t_connlabel");
+static bool connlabel_match(const struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return false;
+
+ return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits);
+}
+
static bool
connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -33,7 +43,7 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (info->options & XT_CONNLABEL_OP_SET)
return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
- return nf_connlabel_match(ct, info->bit) ^ invert;
+ return connlabel_match(ct, info->bit) ^ invert;
}
static int connlabel_mt_check(const struct xt_mtchk_param *par)
@@ -55,7 +65,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return ret;
}
- ret = nf_connlabels_get(par->net, info->bit + 1);
+ ret = nf_connlabels_get(par->net, info->bit);
if (ret < 0)
nf_ct_l3proto_module_put(par->family);
return ret;
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 49d14ecad444..b10ade272b50 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -120,9 +120,9 @@ xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return __inet_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
- in->ifindex);
+ return inet_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
in->ifindex);