summaryrefslogtreecommitdiff
path: root/net/netfilter
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-06-06 18:39:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-06-06 18:39:49 -0700
commit1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch)
treedcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /net/netfilter
parent285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff)
parent7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) Add Maglev hashing scheduler to IPVS, from Inju Song. 2) Lots of new TC subsystem tests from Roman Mashak. 3) Add TCP zero copy receive and fix delayed acks and autotuning with SO_RCVLOWAT, from Eric Dumazet. 4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard Brouer. 5) Add ttl inherit support to vxlan, from Hangbin Liu. 6) Properly separate ipv6 routes into their logically independent components. fib6_info for the routing table, and fib6_nh for sets of nexthops, which thus can be shared. From David Ahern. 7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP messages from XDP programs. From Nikita V. Shirokov. 8) Lots of long overdue cleanups to the r8169 driver, from Heiner Kallweit. 9) Add BTF ("BPF Type Format"), from Martin KaFai Lau. 10) Add traffic condition monitoring to iwlwifi, from Luca Coelho. 11) Plumb extack down into fib_rules, from Roopa Prabhu. 12) Add Flower classifier offload support to igb, from Vinicius Costa Gomes. 13) Add UDP GSO support, from Willem de Bruijn. 14) Add documentation for eBPF helpers, from Quentin Monnet. 15) Add TLS tx offload to mlx5, from Ilya Lesokhin. 16) Allow applications to be given the number of bytes available to read on a socket via a control message returned from recvmsg(), from Soheil Hassas Yeganeh. 17) Add x86_32 eBPF JIT compiler, from Wang YanQing. 18) Add AF_XDP sockets, with zerocopy support infrastructure as well. From Björn Töpel. 19) Remove indirect load support from all of the BPF JITs and handle these operations in the verifier by translating them into native BPF instead. From Daniel Borkmann. 20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha. 21) Allow XDP programs to do lookups in the main kernel routing tables for forwarding. From David Ahern. 22) Allow drivers to store hardware state into an ELF section of kernel dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy. 
23) Various RACK and loss detection improvements in TCP, from Yuchung Cheng. 24) Add TCP SACK compression, from Eric Dumazet. 25) Add User Mode Helper support and basic bpfilter infrastructure, from Alexei Starovoitov. 26) Support ports and protocol values in RTM_GETROUTE, from Roopa Prabhu. 27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard Brouer. 28) Add lots of forwarding selftests, from Petr Machata. 29) Add generic network device failover driver, from Sridhar Samudrala. * ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits) strparser: Add __strp_unpause and use it in ktls. rxrpc: Fix terminal retransmission connection ID to include the channel net: hns3: Optimize PF CMDQ interrupt switching process net: hns3: Fix for VF mailbox receiving unknown message net: hns3: Fix for VF mailbox cannot receiving PF response bnx2x: use the right constant Revert "net: sched: cls: Fix offloading when ingress dev is vxlan" net: dsa: b53: Fix for brcm tag issue in Cygnus SoC enic: fix UDP rss bits netdev-FAQ: clarify DaveM's position for stable backports rtnetlink: validate attributes in do_setlink() mlxsw: Add extack messages for port_{un, }split failures netdevsim: Add extack error message for devlink reload devlink: Add extack to reload and port_{un, }split operations net: metrics: add proper netlink validation ipmr: fix error path when ipmr_new_table fails ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds net: hns3: remove unused hclgevf_cfg_func_mta_filter netfilter: provide udp*_lib_lookup for nf_tproxy qed*: Utilize FW 8.37.2.0 ...
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/Kconfig51
-rw-r--r--net/netfilter/Makefile12
-rw-r--r--net/netfilter/core.c102
-rw-r--r--net/netfilter/ipvs/Kconfig37
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c24
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c467
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c540
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c101
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c5
-rw-r--r--net/netfilter/nf_conncount.c36
-rw-r--r--net/netfilter/nf_conntrack_core.c92
-rw-r--r--net/netfilter/nf_conntrack_ftp.c3
-rw-r--r--net/netfilter/nf_conntrack_irc.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c13
-rw-r--r--net/netfilter/nf_conntrack_sane.c3
-rw-r--r--net/netfilter/nf_conntrack_sip.c2
-rw-r--r--net/netfilter/nf_conntrack_tftp.c2
-rw-r--r--net/netfilter/nf_flow_table_core.c (renamed from net/netfilter/nf_flow_table.c)309
-rw-r--r--net/netfilter/nf_flow_table_inet.c3
-rw-r--r--net/netfilter/nf_flow_table_ip.c489
-rw-r--r--net/netfilter/nf_internals.h5
-rw-r--r--net/netfilter/nf_nat_core.c321
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_common.c9
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c2
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c2
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c2
-rw-r--r--net/netfilter/nf_nat_proto_udp.c4
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c2
-rw-r--r--net/netfilter/nf_nat_redirect.c10
-rw-r--r--net/netfilter/nf_nat_sip.c2
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c1325
-rw-r--r--net/netfilter/nf_tables_core.c72
-rw-r--r--net/netfilter/nfnetlink.c44
-rw-r--r--net/netfilter/nfnetlink_log.c8
-rw-r--r--net/netfilter/nfnetlink_queue.c28
-rw-r--r--net/netfilter/nft_compat.c29
-rw-r--r--net/netfilter/nft_connlimit.c297
-rw-r--r--net/netfilter/nft_counter.c4
-rw-r--r--net/netfilter/nft_ct.c3
-rw-r--r--net/netfilter/nft_dynset.c16
-rw-r--r--net/netfilter/nft_exthdr.c23
-rw-r--r--net/netfilter/nft_flow_offload.c5
-rw-r--r--net/netfilter/nft_fwd_netdev.c146
-rw-r--r--net/netfilter/nft_hash.c127
-rw-r--r--net/netfilter/nft_immediate.c27
-rw-r--r--net/netfilter/nft_log.c92
-rw-r--r--net/netfilter/nft_lookup.c47
-rw-r--r--net/netfilter/nft_meta.c112
-rw-r--r--net/netfilter/nft_nat.c2
-rw-r--r--net/netfilter/nft_numgen.c158
-rw-r--r--net/netfilter/nft_objref.c4
-rw-r--r--net/netfilter/nft_rt.c22
-rw-r--r--net/netfilter/nft_set_bitmap.c34
-rw-r--r--net/netfilter/nft_set_hash.c174
-rw-r--r--net/netfilter/nft_set_rbtree.c109
-rw-r--r--net/netfilter/nft_socket.c144
-rw-r--r--net/netfilter/xt_NETMAP.c8
-rw-r--r--net/netfilter/xt_NFLOG.c15
-rw-r--r--net/netfilter/xt_REDIRECT.c2
-rw-r--r--net/netfilter/xt_TPROXY.c366
-rw-r--r--net/netfilter/xt_nat.c72
-rw-r--r--net/netfilter/xt_osf.c202
-rw-r--r--net/netfilter/xt_socket.c4
74 files changed, 4792 insertions, 1840 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 44d8a55e9721..dbd7d1fad277 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -433,17 +433,16 @@ config NF_NAT_TFTP
default NF_NAT && NF_CONNTRACK_TFTP
config NF_NAT_REDIRECT
- tristate "IPv4/IPv6 redirect support"
- depends on NF_NAT
- help
- This is the kernel functionality to redirect packets to local
- machine through NAT.
+ bool
config NETFILTER_SYNPROXY
tristate
endif # NF_CONNTRACK
+config NF_OSF
+ tristate
+
config NF_TABLES
select NETFILTER_NETLINK
tristate "Netfilter nf_tables support"
@@ -474,24 +473,6 @@ config NF_TABLES_NETDEV
help
This option enables support for the "netdev" table.
-config NFT_EXTHDR
- tristate "Netfilter nf_tables exthdr module"
- help
- This option adds the "exthdr" expression that you can use to match
- IPv6 extension headers and tcp options.
-
-config NFT_META
- tristate "Netfilter nf_tables meta module"
- help
- This option adds the "meta" expression that you can use to match and
- to set packet metainformation such as the packet mark.
-
-config NFT_RT
- tristate "Netfilter nf_tables routing module"
- help
- This option adds the "rt" expression that you can use to match
- packet routing information such as the packet nexthop.
-
config NFT_NUMGEN
tristate "Netfilter nf_tables number generator module"
help
@@ -536,6 +517,15 @@ config NFT_COUNTER
This option adds the "counter" expression that you can use to
include packet and byte counters in a rule.
+config NFT_CONNLIMIT
+ tristate "Netfilter nf_tables connlimit module"
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_CONNCOUNT
+ help
+ This option adds the "connlimit" expression that you can use to
+ ratelimit rule matchings per connections.
+
config NFT_LOG
tristate "Netfilter nf_tables log module"
help
@@ -632,6 +622,15 @@ config NFT_FIB_INET
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_SOCKET
+ tristate "Netfilter nf_tables socket match support"
+ depends on IPV6 || IPV6=n
+ select NF_SOCKET_IPV4
+ select NF_SOCKET_IPV6 if IPV6
+ help
+ This option allows matching for the presence or absence of a
+ corresponding socket and its attributes.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -667,8 +666,7 @@ endif # NF_TABLES
config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
- depends on NF_FLOW_TABLE_IPV4
- depends on NF_FLOW_TABLE_IPV6
+ depends on NF_FLOW_TABLE
help
This option adds the flow table mixed IPv4/IPv6 support.
@@ -1000,6 +998,8 @@ config NETFILTER_XT_TARGET_TPROXY
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
+ select NF_TPROXY_IPV4
+ select NF_TPROXY_IPV6 if IP6_NF_IPTABLES
help
This option adds a `TPROXY' target, which is somewhat similar to
REDIRECT. It can only be used in the mangle table and is useful
@@ -1378,6 +1378,7 @@ config NETFILTER_XT_MATCH_NFACCT
config NETFILTER_XT_MATCH_OSF
tristate '"osf" Passive OS fingerprint match'
depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+ select NF_OSF
help
This option selects the Passive OS Fingerprinting match module
that allows to passively match the remote operating system by
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index fd32bd2c9521..44449389e527 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
-obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
+nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
@@ -76,13 +76,11 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
-obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
-obj-$(CONFIG_NFT_META) += nft_meta.o
-obj-$(CONFIG_NFT_RT) += nft_rt.o
+obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
@@ -104,6 +102,8 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
obj-$(CONFIG_NFT_FIB) += nft_fib.o
obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
+obj-$(CONFIG_NF_OSF) += nf_osf.o
+obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
@@ -111,6 +111,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
+
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
# generic X tables
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 206fb2c4c319..168af54db975 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -138,11 +138,6 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
continue;
}
- if (reg->nat_hook && orig_ops[i]->nat_hook) {
- kvfree(new);
- return ERR_PTR(-EBUSY);
- }
-
if (inserted || reg->priority > orig_ops[i]->priority) {
new_ops[nhooks] = (void *)orig_ops[i];
new->hooks[nhooks] = old->hooks[i];
@@ -186,9 +181,31 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
#endif
}
+int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg)
+{
+ struct nf_hook_entries *new_hooks;
+ struct nf_hook_entries *p;
+
+ p = rcu_dereference_raw(*pp);
+ new_hooks = nf_hook_entries_grow(p, reg);
+ if (IS_ERR(new_hooks))
+ return PTR_ERR(new_hooks);
+
+ hooks_validate(new_hooks);
+
+ rcu_assign_pointer(*pp, new_hooks);
+
+ BUG_ON(p == new_hooks);
+ nf_hook_entries_free(p);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw);
+
/*
* __nf_hook_entries_try_shrink - try to shrink hook array
*
+ * @old -- current hook blob at @pp
* @pp -- location of hook blob
*
* Hook unregistration must always succeed, so to-be-removed hooks
@@ -201,14 +218,14 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
*
* Returns address to free, or NULL.
*/
-static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp)
+static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
+ struct nf_hook_entries __rcu **pp)
{
- struct nf_hook_entries *old, *new = NULL;
unsigned int i, j, skip = 0, hook_entries;
+ struct nf_hook_entries *new = NULL;
struct nf_hook_ops **orig_ops;
struct nf_hook_ops **new_ops;
- old = nf_entry_dereference(*pp);
if (WARN_ON_ONCE(!old))
return NULL;
@@ -347,11 +364,10 @@ static int __nf_register_net_hook(struct net *net, int pf,
* This cannot fail, hook unregistration must always succeed.
* Therefore replace the to-be-removed hook with a dummy hook.
*/
-static void nf_remove_net_hook(struct nf_hook_entries *old,
- const struct nf_hook_ops *unreg, int pf)
+static bool nf_remove_net_hook(struct nf_hook_entries *old,
+ const struct nf_hook_ops *unreg)
{
struct nf_hook_ops **orig_ops;
- bool found = false;
unsigned int i;
orig_ops = nf_hook_entries_get_hook_ops(old);
@@ -360,21 +376,10 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
continue;
WRITE_ONCE(old->hooks[i].hook, accept_all);
WRITE_ONCE(orig_ops[i], &dummy_ops);
- found = true;
- break;
+ return true;
}
- if (found) {
-#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS)
- net_dec_ingress_queue();
-#endif
-#ifdef HAVE_JUMP_LABEL
- static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]);
-#endif
- } else {
- WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum);
- }
+ return false;
}
static void __nf_unregister_net_hook(struct net *net, int pf,
@@ -395,9 +400,19 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
return;
}
- nf_remove_net_hook(p, reg, pf);
+ if (nf_remove_net_hook(p, reg)) {
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_dec_ingress_queue();
+#endif
+#ifdef HAVE_JUMP_LABEL
+ static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
+#endif
+ } else {
+ WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
+ }
- p = __nf_hook_entries_try_shrink(pp);
+ p = __nf_hook_entries_try_shrink(p, pp);
mutex_unlock(&nf_hook_mutex);
if (!p)
return;
@@ -417,6 +432,19 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
}
EXPORT_SYMBOL(nf_unregister_net_hook);
+void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg)
+{
+ struct nf_hook_entries *p;
+
+ p = rcu_dereference_raw(*pp);
+ if (nf_remove_net_hook(p, reg)) {
+ p = __nf_hook_entries_try_shrink(p, pp);
+ nf_hook_entries_free(p);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw);
+
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
int err;
@@ -535,6 +563,9 @@ EXPORT_SYMBOL(skb_make_writable);
struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
+struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
@@ -543,6 +574,9 @@ void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
__rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
+struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_hook);
+
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
void (*attach)(struct sk_buff *, const struct sk_buff *);
@@ -557,17 +591,14 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
}
EXPORT_SYMBOL(nf_ct_attach);
-void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
-EXPORT_SYMBOL(nf_ct_destroy);
-
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
- void (*destroy)(struct nf_conntrack *);
+ struct nf_ct_hook *ct_hook;
rcu_read_lock();
- destroy = rcu_dereference(nf_ct_destroy);
- BUG_ON(destroy == NULL);
- destroy(nfct);
+ ct_hook = rcu_dereference(nf_ct_hook);
+ BUG_ON(ct_hook == NULL);
+ ct_hook->destroy(nfct);
rcu_read_unlock();
}
EXPORT_SYMBOL(nf_conntrack_destroy);
@@ -580,11 +611,6 @@ const struct nf_conntrack_zone nf_ct_zone_dflt = {
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
#endif /* CONFIG_NF_CONNTRACK */
-#ifdef CONFIG_NF_NAT_NEEDED
-void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
-EXPORT_SYMBOL(nf_nat_decode_session_hook);
-#endif
-
static void __net_init
__netfilter_net_init(struct nf_hook_entries __rcu **e, int max)
{
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..05dc1b77e466 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,25 @@ config IP_VS_SH
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_MH
+ tristate "maglev hashing scheduling"
+ ---help---
+ The maglev consistent hashing scheduling algorithm provides the
+ Google's Maglev hashing algorithm as an IPVS scheduler. It assigns
+ network connections to the servers through looking up a statically
+ assigned special hash table called the lookup table. Maglev hashing
+ is to assign a preference list of all the lookup table positions
+ to each destination.
+
+ Through this operation, the maglev hashing gives an almost equal
+ share of the lookup table to each of the destinations and provides
+ minimal disruption by using the lookup table. When the set of
+ destinations changes, a connection will likely be sent to the same
+ destination as it was before.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
config IP_VS_SED
tristate "shortest expected delay scheduling"
---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
needs to be large enough to effectively fit all the destinations
multiplied by their respective weights.
+comment 'IPVS MH scheduler'
+
+config IP_VS_MH_TAB_INDEX
+ int "IPVS maglev hashing table index of size (the prime numbers)"
+ range 8 17
+ default 12
+ ---help---
+ The maglev hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is assigned by a preference
+ list of the positions to each destination until all slots in
+ the table are filled. The index determines the prime for size of
+ the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+ 65521 or 131071. When using weights to allow destinations to
+ receive more connections, the table is assigned an amount
+ proportional to the weights specified. The table needs to be large
+ enough to effectively fit all the destinations multiplied by their
+ respective weights.
+
comment 'IPVS application helper'
config IP_VS_FTP
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..bfce2677fda2 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index c3db074fc1f7..7588aeaa605f 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
- struct ip_vs_app *app)
+ struct ip_vs_app *app,
+ struct ip_vs_iphdr *ipvsh)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
if (app->pkt_out == NULL)
return 1;
- if (!app->pkt_out(app, cp, skb, &diff))
+ if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
return 0;
/*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns false if it can't handle packet (oom)
*/
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_iphdr *ipvsh)
{
struct ip_vs_app *app;
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
- return app_tcp_pkt_out(cp, skb, app);
+ return app_tcp_pkt_out(cp, skb, app, ipvsh);
/*
* Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
if (app->pkt_out == NULL)
return 1;
- return app->pkt_out(app, cp, skb, NULL);
+ return app->pkt_out(app, cp, skb, NULL, ipvsh);
}
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
- struct ip_vs_app *app)
+ struct ip_vs_app *app,
+ struct ip_vs_iphdr *ipvsh)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
if (app->pkt_in == NULL)
return 1;
- if (!app->pkt_in(app, cp, skb, &diff))
+ if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
return 0;
/*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns false if can't handle packet (oom).
*/
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_iphdr *ipvsh)
{
struct ip_vs_app *app;
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
- return app_tcp_pkt_in(cp, skb, app);
+ return app_tcp_pkt_in(cp, skb, app, ipvsh);
/*
* Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
if (app->pkt_in == NULL)
return 1;
- return app->pkt_in(app, cp, skb, NULL);
+ return app->pkt_in(app, cp, skb, NULL, ipvsh);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 141b1509c948..0c03c0e16a96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
+ /* keep the last_weight with latest non-0 weight */
+ if (add || udest->weight != 0)
+ atomic_set(&dest->last_weight, udest->weight);
+
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..07459e71d907 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/hash.h>
#include <net/ip_vs.h>
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..4398a72edec5 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -29,6 +29,8 @@
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
@@ -44,9 +46,18 @@
#include <net/ip_vs.h>
-#define SERVER_STRING "227 "
-#define CLIENT_STRING "PORT"
+#define SERVER_STRING_PASV "227 "
+#define CLIENT_STRING_PORT "PORT"
+#define SERVER_STRING_EPSV "229 "
+#define CLIENT_STRING_EPRT "EPRT"
+enum {
+ IP_VS_FTP_ACTIVE = 0,
+ IP_VS_FTP_PORT = 0,
+ IP_VS_FTP_PASV,
+ IP_VS_FTP_EPRT,
+ IP_VS_FTP_EPSV,
+};
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
-/* Dummy variable */
-static int ip_vs_ftp_pasv;
+static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
+{
+ struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
+
+ if ((th->doff << 2) < sizeof(struct tcphdr))
+ return NULL;
+ return (char *)th + (th->doff << 2);
+}
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
}
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern", ignoring before "skip" and terminated with
- * the "term" character.
- * <addr,port> is in network order.
+/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern". <addr,port> is in network order.
+ * Parse extended format depending on ext. In this case addr can be pre-set.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
const char *pattern, size_t plen,
- char skip, char term,
- __be32 *addr, __be16 *port,
- char **start, char **end)
+ char skip, bool ext, int mode,
+ union nf_inet_addr *addr, __be16 *port,
+ __u16 af, char **start, char **end)
{
char *s, c;
unsigned char p[6];
+ char edelim;
+ __u16 hport;
int i = 0;
if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (s == data_limit)
return -1;
if (!found) {
+ /* "(" is optional for non-extended format,
+ * so catch the start of IPv4 address
+ */
+ if (!ext && isdigit(*s))
+ break;
if (*s == skip)
found = 1;
} else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
}
}
}
+ /* Old IPv4-only format? */
+ if (!ext) {
+ p[0] = 0;
+ for (data = s; ; data++) {
+ if (data == data_limit)
+ return -1;
+ c = *data;
+ if (isdigit(c)) {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
+ i++;
+ p[i] = 0;
+ } else {
+ /* unexpected character or terminator */
+ break;
+ }
+ }
- for (data = s; ; data++) {
- if (data == data_limit)
+ if (i != 5)
return -1;
- if (*data == term)
- break;
+
+ *start = s;
+ *end = data;
+ addr->ip = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
+ return 1;
}
- *end = data;
+ if (s == data_limit)
+ return -1;
+ *start = s;
+ edelim = *s++;
+ if (edelim < 33 || edelim > 126)
+ return -1;
+ if (s == data_limit)
+ return -1;
+ if (*s == edelim) {
+ /* Address family is usually missing for EPSV response */
+ if (mode != IP_VS_FTP_EPSV)
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ /* Then address should be missing too */
+ if (*s != edelim)
+ return -1;
+ /* Caller can pre-set addr, if needed */
+ s++;
+ } else {
+ const char *ep;
- memset(p, 0, sizeof(p));
- for (data = s; ; data++) {
- c = *data;
- if (c == term)
- break;
- if (c >= '0' && c <= '9') {
- p[i] = p[i]*10 + c - '0';
- } else if (c == ',' && i < 5) {
- i++;
- } else {
- /* unexpected character */
+ /* We allow address only from same family */
+ if (af == AF_INET6 && *s != '2')
return -1;
+ if (af == AF_INET && *s != '1')
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ if (*s != edelim)
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ if (af == AF_INET6) {
+ if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
+ &ep) <= 0)
+ return -1;
+ } else {
+ if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
+ &ep) <= 0)
+ return -1;
}
+ s = (char *) ep;
+ if (s == data_limit)
+ return -1;
+ if (*s != edelim)
+ return -1;
+ s++;
}
-
- if (i != 5)
+ for (hport = 0; ; s++)
+ {
+ if (s == data_limit)
+ return -1;
+ if (!isdigit(*s))
+ break;
+ hport = hport * 10 + *s - '0';
+ }
+ if (s == data_limit || !hport || *s != edelim)
return -1;
-
- *start = s;
- *addr = get_unaligned((__be32 *) p);
- *port = get_unaligned((__be16 *) (p + 4));
+ s++;
+ *end = s;
+ *port = htons(hport);
return 1;
}
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
+/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
* client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
* The outgoing packet should be something like
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * The extended format for EPSV response provides usually only port:
+ * "229 Entering Extended Passive Mode (|||ppp|)"
*/
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff *skb, int *diff)
+ struct sk_buff *skb, int *diff,
+ struct ip_vs_iphdr *ipvsh)
{
- struct iphdr *iph;
- struct tcphdr *th;
char *data, *data_limit;
char *start, *end;
union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*diff = 0;
-#ifdef CONFIG_IP_VS_IPV6
- /* This application helper doesn't work with IPv6 yet,
- * so turn this into a no-op for IPv6 packets
- */
- if (cp->af == AF_INET6)
- return 1;
-#endif
-
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (!skb_make_writable(skb, skb->len))
return 0;
- if (cp->app_data == &ip_vs_ftp_pasv) {
- iph = ip_hdr(skb);
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)th + (th->doff << 2);
+ if (cp->app_data == (void *) IP_VS_FTP_PASV) {
+ data = ip_vs_ftp_data_ptr(skb, ipvsh);
data_limit = skb_tail_pointer(skb);
+ if (!data || data >= data_limit)
+ return 1;
+
if (ip_vs_ftp_get_addrport(data, data_limit,
- SERVER_STRING,
- sizeof(SERVER_STRING)-1,
- '(', ')',
- &from.ip, &port,
+ SERVER_STRING_PASV,
+ sizeof(SERVER_STRING_PASV)-1,
+ '(', false, IP_VS_FTP_PASV,
+ &from, &port, cp->af,
&start, &end) != 1)
return 1;
- IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+ IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
&from.ip, ntohs(port), &cp->caddr.ip, 0);
+ } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+ data = ip_vs_ftp_data_ptr(skb, ipvsh);
+ data_limit = skb_tail_pointer(skb);
- /*
- * Now update or create an connection entry for it
+ if (!data || data >= data_limit)
+ return 1;
+
+ /* Usually, the data address is not specified, but
+ * we support a different address, so pre-set it.
*/
- {
- struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs, AF_INET,
- iph->protocol, &from, port,
- &cp->caddr, 0, &p);
- n_cp = ip_vs_conn_out_get(&p);
- }
- if (!n_cp) {
- struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs,
- AF_INET, IPPROTO_TCP, &cp->caddr,
- 0, &cp->vaddr, port, &p);
- /* As above, this is ipv4 only */
- n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
- IP_VS_CONN_F_NO_CPORT |
- IP_VS_CONN_F_NFCT,
- cp->dest, skb->mark);
- if (!n_cp)
- return 0;
+ from = cp->daddr;
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING_EPSV,
+ sizeof(SERVER_STRING_EPSV)-1,
+ '(', true, IP_VS_FTP_EPSV,
+ &from, &port, cp->af,
+ &start, &end) != 1)
+ return 1;
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
- }
+ IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
+ IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
+ } else {
+ return 1;
+ }
- /*
- * Replace the old passive address with the new one
- */
+ /* Now update or create a connection entry for it */
+ {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->ipvs, cp->af,
+ ipvsh->protocol, &from, port,
+ &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
+ if (!n_cp) {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->ipvs,
+ cp->af, ipvsh->protocol, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
+ cp->dest, skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /* Replace the old passive address with the new one */
+ if (cp->app_data == (void *) IP_VS_FTP_PASV) {
from.ip = n_cp->vaddr.ip;
port = n_cp->vport;
snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
((unsigned char *)&from.ip)[3],
ntohs(port) >> 8,
ntohs(port) & 0xFF);
+ } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+ from = n_cp->vaddr;
+ port = n_cp->vport;
+ /* Only port, client will use VIP for the data connection */
+ snprintf(buf, sizeof(buf), "|||%u|",
+ ntohs(port));
+ } else {
+ *buf = 0;
+ }
+ buf_len = strlen(buf);
- buf_len = strlen(buf);
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct) {
- bool mangled;
-
- /* If mangling fails this function will return 0
- * which will cause the packet to be dropped.
- * Mangling can only fail under memory pressure,
- * hopefully it will succeed on the retransmitted
- * packet.
- */
- mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
- iph->ihl * 4,
- start - data,
- end - start,
- buf, buf_len);
- if (mangled) {
- ip_vs_nfct_expect_related(skb, ct, n_cp,
- IPPROTO_TCP, 0, 0);
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- /* csum is updated */
- ret = 1;
- }
- }
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ bool mangled;
- /*
- * Not setting 'diff' is intentional, otherwise the sequence
- * would be adjusted twice.
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
*/
-
- cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
- ip_vs_conn_put(n_cp);
- return ret;
+ mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ ipvsh->len,
+ start - data,
+ end - start,
+ buf, buf_len);
+ if (mangled) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ ipvsh->protocol, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
}
- return 1;
+
+ /* Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
+ */
+
+ cp->app_data = (void *) IP_VS_FTP_ACTIVE;
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+ return ret;
}
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
+/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
* (outside-to-inside).
*
* The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* In this case, we create a connection entry using the client address and
* port, so that the active ftp data connection from the server can reach
* the client.
+ * Extended format:
+ * "EPSV\r\n" when client requests server address from same family
+ * "EPSV 1\r\n" when client requests IPv4 server address
+ * "EPSV 2\r\n" when client requests IPv6 server address
+ * "EPSV ALL\r\n" - not supported
+ * EPRT with specified delimiter (ASCII 33..126), "|" by default:
+ * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
+ * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
*/
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff *skb, int *diff)
+ struct sk_buff *skb, int *diff,
+ struct ip_vs_iphdr *ipvsh)
{
- struct iphdr *iph;
- struct tcphdr *th;
char *data, *data_start, *data_limit;
char *start, *end;
union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/* no diff required for incoming packets */
*diff = 0;
-#ifdef CONFIG_IP_VS_IPV6
- /* This application helper doesn't work with IPv6 yet,
- * so turn this into a no-op for IPv6 packets
- */
- if (cp->af == AF_INET6)
- return 1;
-#endif
-
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (!skb_make_writable(skb, skb->len))
return 0;
- /*
- * Detecting whether it is passive
- */
- iph = ip_hdr(skb);
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
- /* Since there may be OPTIONS in the TCP packet and the HLEN is
- the length of the header in 32-bit multiples, it is accurate
- to calculate data address by th+HLEN*4 */
- data = data_start = (char *)th + (th->doff << 2);
+ data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
data_limit = skb_tail_pointer(skb);
+ if (!data || data >= data_limit)
+ return 1;
while (data <= data_limit - 6) {
- if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+ if (cp->af == AF_INET &&
+ strncasecmp(data, "PASV\r\n", 6) == 0) {
/* Passive mode on */
IP_VS_DBG(7, "got PASV at %td of %td\n",
data - data_start,
data_limit - data_start);
- cp->app_data = &ip_vs_ftp_pasv;
+ cp->app_data = (void *) IP_VS_FTP_PASV;
return 1;
}
+
+ /* EPSV or EPSV<space><net-prt> */
+ if (strncasecmp(data, "EPSV", 4) == 0 &&
+ (data[4] == ' ' || data[4] == '\r')) {
+ if (data[4] == ' ') {
+ char proto = data[5];
+
+ if (data > data_limit - 7 || data[6] != '\r')
+ return 1;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && proto == '2') {
+ } else
+#endif
+ if (cp->af == AF_INET && proto == '1') {
+ } else {
+ return 1;
+ }
+ }
+ /* Extended Passive mode on */
+ IP_VS_DBG(7, "got EPSV at %td of %td\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = (void *) IP_VS_FTP_EPSV;
+ return 1;
+ }
+
data++;
}
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
* then create a new connection entry for the coming data
* connection.
*/
- if (ip_vs_ftp_get_addrport(data_start, data_limit,
- CLIENT_STRING, sizeof(CLIENT_STRING)-1,
- ' ', '\r', &to.ip, &port,
- &start, &end) != 1)
+ if (cp->af == AF_INET &&
+ ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING_PORT,
+ sizeof(CLIENT_STRING_PORT)-1,
+ ' ', false, IP_VS_FTP_PORT,
+ &to, &port, cp->af,
+ &start, &end) == 1) {
+
+ IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
+
+ /* Now update or create a connection entry for it */
+ IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
+ ip_vs_proto_name(ipvsh->protocol),
+ &to.ip, ntohs(port), &cp->vaddr.ip,
+ ntohs(cp->vport)-1);
+ } else if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING_EPRT,
+ sizeof(CLIENT_STRING_EPRT)-1,
+ ' ', true, IP_VS_FTP_EPRT,
+ &to, &port, cp->af,
+ &start, &end) == 1) {
+
+ IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
+ IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
+
+ /* Now update or create a connection entry for it */
+ IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
+ ip_vs_proto_name(ipvsh->protocol),
+ IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport)-1);
+ } else {
return 1;
-
- IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+ }
/* Passive mode off */
- cp->app_data = NULL;
-
- /*
- * Now update or create a connection entry for it
- */
- IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
- ip_vs_proto_name(iph->protocol),
- &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+ cp->app_data = (void *) IP_VS_FTP_ACTIVE;
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs, AF_INET,
- iph->protocol, &to, port, &cp->vaddr,
+ ip_vs_conn_fill_param(cp->ipvs, cp->af,
+ ipvsh->protocol, &to, port, &cp->vaddr,
htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
- /* This is ipv4 only */
- n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+ n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
htons(ntohs(cp->dport)-1),
IP_VS_CONN_F_NFCT, cp->dest,
skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
- pr_info("%s: loaded support on port[%d] = %d\n",
+ pr_info("%s: loaded support on port[%d] = %u\n",
app->name, i, ports[i]);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..b9f375e6dc93 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
+#include <linux/hash.h>
/* for sysctl */
#include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
}
@@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+ atomic_set(&tbl->entries, 0);
/*
* Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..542c4949937a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/hash.h>
/* for sysctl */
#include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
}
@@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+ atomic_set(&tbl->entries, 0);
/*
* Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 000000000000..0f795b186eb3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* IPVS: Maglev Hashing scheduling module
+ *
+ * Authors: Inju Song <inju.song@navercorp.com>
+ *
+ */
+
+/* The mh algorithm assigns a preference list of all the lookup
+ * table positions to each destination and populates the table with
+ * the most-preferred positions of the destinations. It then selects
+ * a destination by using the hash key of the source IP address to
+ * look up the lookup table.
+ *
+ * The algorithm is detailed in:
+ * [3.4 Consistent Hashing]
+https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
+
+struct ip_vs_mh_lookup {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+struct ip_vs_mh_dest_setup {
+ unsigned int offset; /* starting offset */
+ unsigned int skip; /* skip */
+ unsigned int perm; /* next_offset */
+ int turns; /* weight / gcd() and rshift */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+ 8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX 12
+#endif
+#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX]
+
+struct ip_vs_mh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_mh_lookup *lookup;
+ struct ip_vs_mh_dest_setup *dest_setup;
+ hsiphash_key_t hash1, hash2;
+ int gcd;
+ int rshift;
+};
+
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+ hsiphash_key_t *hash2)
+{
+ hash1->key[0] = 2654435761UL;
+ hash1->key[1] = 2654435761UL;
+
+ hash2->key[0] = 2654446892UL;
+ hash2->key[1] = 2654446892UL;
+}
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Returns hash value for IPVS MH entry */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+ unsigned int v;
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
+ addr->ip6[2] ^ addr->ip6[3];
+#endif
+ v = (offset + ntohs(port) + ntohl(addr_fold));
+ return hsiphash(&v, sizeof(v), key);
+}
+
+/* Reset all the hash buckets of the specified table. */
+static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
+{
+ int i;
+ struct ip_vs_mh_lookup *l;
+ struct ip_vs_dest *dest;
+
+ l = &s->lookup[0];
+ for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(l->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(l->dest, NULL);
+ }
+ l++;
+ }
+}
+
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
+{
+ struct list_head *p;
+ struct ip_vs_mh_dest_setup *ds;
+ struct ip_vs_dest *dest;
+ int lw;
+
+ /* If gcd is smaller than 1, number of dests or
+ * all last_weight of dests are zero. So, skip
+ * permutation for the dests.
+ */
+ if (s->gcd < 1)
+ return 0;
+
+ /* Set dest_setup for the dests permutation */
+ p = &svc->destinations;
+ ds = &s->dest_setup[0];
+ while ((p = p->next) != &svc->destinations) {
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+ dest->port, &s->hash1, 0) %
+ IP_VS_MH_TAB_SIZE;
+ ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+ dest->port, &s->hash2, 0) %
+ (IP_VS_MH_TAB_SIZE - 1) + 1;
+ ds->perm = ds->offset;
+
+ lw = atomic_read(&dest->last_weight);
+ ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
+ ds++;
+ }
+
+ return 0;
+}
+
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
+{
+ int n, c, dt_count;
+ unsigned long *table;
+ struct list_head *p;
+ struct ip_vs_mh_dest_setup *ds;
+ struct ip_vs_dest *dest, *new_dest;
+
+ /* If gcd is smaller than 1, number of dests or
+ * all last_weight of dests are zero. So, skip
+ * the population for the dests and reset lookup table.
+ */
+ if (s->gcd < 1) {
+ ip_vs_mh_reset(s);
+ return 0;
+ }
+
+ table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ p = &svc->destinations;
+ n = 0;
+ dt_count = 0;
+ while (n < IP_VS_MH_TAB_SIZE) {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ ds = &s->dest_setup[0];
+ while (p != &svc->destinations) {
+ /* Ignore added server with zero weight */
+ if (ds->turns < 1) {
+ p = p->next;
+ ds++;
+ continue;
+ }
+
+ c = ds->perm;
+ while (test_bit(c, table)) {
+ /* Add skip, mod IP_VS_MH_TAB_SIZE */
+ ds->perm += ds->skip;
+ if (ds->perm >= IP_VS_MH_TAB_SIZE)
+ ds->perm -= IP_VS_MH_TAB_SIZE;
+ c = ds->perm;
+ }
+
+ __set_bit(c, table);
+
+ dest = rcu_dereference_protected(s->lookup[c].dest, 1);
+ new_dest = list_entry(p, struct ip_vs_dest, n_list);
+ if (dest != new_dest) {
+ if (dest)
+ ip_vs_dest_put(dest);
+ ip_vs_dest_hold(new_dest);
+ RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
+ }
+
+ if (++n == IP_VS_MH_TAB_SIZE)
+ goto out;
+
+ if (++dt_count >= ds->turns) {
+ dt_count = 0;
+ p = p->next;
+ ds++;
+ }
+ }
+ }
+
+out:
+ kfree(table);
+ return 0;
+}
+
+/* Get ip_vs_dest associated with supplied parameters. */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
+ % IP_VS_MH_TAB_SIZE;
+ struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
+ struct ip_vs_dest *dest;
+
+ /* First try the dest it's supposed to go to */
+ ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+ &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
+ dest = rcu_dereference(s->lookup[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ /* If the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
+ for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
+ hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+ roffset) % IP_VS_MH_TAB_SIZE;
+ dest = rcu_dereference(s->lookup[hash].dest);
+ if (!dest)
+ break;
+ if (!is_unavailable(dest))
+ return dest;
+ IP_VS_DBG_BUF(6,
+ "MH: selected unavailable server %s:%u (offset %u), reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port), roffset);
+ }
+
+ return NULL;
+}
+
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
+{
+ int ret;
+
+ if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+ return -EINVAL;
+
+ if (svc->num_dests >= 1) {
+ s->dest_setup = kcalloc(svc->num_dests,
+ sizeof(struct ip_vs_mh_dest_setup),
+ GFP_KERNEL);
+ if (!s->dest_setup)
+ return -ENOMEM;
+ }
+
+ ip_vs_mh_permutate(s, svc);
+
+ ret = ip_vs_mh_populate(s, svc);
+ if (ret < 0)
+ goto out;
+
+ IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port));
+
+out:
+ if (svc->num_dests >= 1) {
+ kfree(s->dest_setup);
+ s->dest_setup = NULL;
+ }
+ return ret;
+}
+
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->last_weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+ struct ip_vs_dest *dest;
+ int new_weight, weight = 0;
+ int mw, shift;
+
+ /* If gcd is smaller than 1, number of dests or
+ * all last_weight of dests are zero. So, return
+ * shift value as zero.
+ */
+ if (gcd < 1)
+ return 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ new_weight = atomic_read(&dest->last_weight);
+ if (new_weight > weight)
+ weight = new_weight;
+ }
+
+ /* Because gcd is greater than zero,
+ * the maximum weight and gcd are always greater than zero
+ */
+ mw = weight / gcd;
+
+ /* shift = occupied bits of weight/gcd - MH highest bits */
+ shift = fls(mw) - IP_VS_MH_TAB_BITS;
+ return (shift >= 0) ? shift : 0;
+}
+
+static void ip_vs_mh_state_free(struct rcu_head *head)
+{
+ struct ip_vs_mh_state *s;
+
+ s = container_of(head, struct ip_vs_mh_state, rcu_head);
+ kfree(s->lookup);
+ kfree(s);
+}
+
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+ int ret;
+ struct ip_vs_mh_state *s;
+
+ /* Allocate the MH table for this service */
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+ GFP_KERNEL);
+ if (!s->lookup) {
+ kfree(s);
+ return -ENOMEM;
+ }
+
+ generate_hash_secret(&s->hash1, &s->hash2);
+ s->gcd = ip_vs_mh_gcd_weight(svc);
+ s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+ IP_VS_DBG(6,
+ "MH lookup table (memory=%zdbytes) allocated for current service\n",
+ sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+
+ /* Assign the lookup table with current dests */
+ ret = ip_vs_mh_reassign(s, svc);
+ if (ret < 0) {
+ ip_vs_mh_reset(s);
+ ip_vs_mh_state_free(&s->rcu_head);
+ return ret;
+ }
+
+ /* No more failures, attach state */
+ svc->sched_data = s;
+ return 0;
+}
+
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_mh_state *s = svc->sched_data;
+
+ /* Got to clean up lookup entry here */
+ ip_vs_mh_reset(s);
+
+ call_rcu(&s->rcu_head, ip_vs_mh_state_free);
+ IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+ sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+}
+
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_mh_state *s = svc->sched_data;
+
+ s->gcd = ip_vs_mh_gcd_weight(svc);
+ s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+ /* Assign the lookup table with the updated service */
+ return ip_vs_mh_reassign(s, svc);
+}
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+ __be16 _ports[2], *ports;
+
+ /* At this point we know that we have a valid packet of some kind.
+ * Because ICMP packets are only guaranteed to have the first 8
+ * bytes, let's just grab the ports. Fortunately they're in the
+ * same position for all three of the protocols we care about.
+ */
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+ &_ports);
+ if (unlikely(!ports))
+ return 0;
+
+ if (likely(!ip_vs_iph_inverse(iph)))
+ return ports[0];
+ else
+ return ports[1];
+ default:
+ return 0;
+ }
+}
+
+/* Maglev Hashing scheduling */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_mh_state *s;
+ __be16 port = 0;
+ const union nf_inet_addr *hash_addr;
+
+ hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
+
+ IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+ port = ip_vs_mh_get_port(skb, iph);
+
+ s = (struct ip_vs_mh_state *)svc->sched_data;
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
+ dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
+ else
+ dest = ip_vs_mh_get(svc, s, hash_addr, port);
+
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
+ IP_VS_DBG_ADDR(svc->af, hash_addr),
+ ntohs(port),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+/* IPVS MH Scheduler structure */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+ .name = "mh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+ .init_service = ip_vs_mh_init_svc,
+ .done_service = ip_vs_mh_done_svc,
+ .add_dest = ip_vs_mh_dest_changed,
+ .del_dest = ip_vs_mh_dest_changed,
+ .upd_dest = ip_vs_mh_dest_changed,
+ .schedule = ip_vs_mh_schedule,
+};
+
+static int __init ip_vs_mh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+}
+
+static void __exit ip_vs_mh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+ rcu_barrier();
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 6cf3fd81a5ec..eb8b9c883889 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -67,15 +67,20 @@
#include <net/netfilter/nf_conntrack_zones.h>
-#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
- &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+#define FMT_TUPLE "%s:%u->%s:%u/%u"
+#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \
+ ntohs((T)->src.u.all), \
+ IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \
+ ntohs((T)->dst.u.all), \
(T)->dst.protonum
-#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
- &((C)->vaddr.ip), ntohs((C)->vport), \
- &((C)->daddr.ip), ntohs((C)->dport), \
+#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u"
+#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \
+ ntohs((C)->cport), \
+ IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \
+ ntohs((C)->vport), \
+ IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \
+ ntohs((C)->dport), \
(C)->protocol, (C)->state
void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
new_tuple.dst.protonum != IPPROTO_ICMPV6)
new_tuple.dst.u.tcp.port = cp->vport;
}
- IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
- "ctinfo=%d, old reply=" FMT_TUPLE
- ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
- __func__, ct, ct->status, ctinfo,
- ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
- ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
+ IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, new reply=" FMT_TUPLE "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&new_tuple));
nf_conntrack_alter_reply(ct, &new_tuple);
+ IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
+ __func__, ct, ARG_CONN(cp));
}
int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct ip_vs_conn_param p;
struct net *net = nf_ct_net(ct);
- if (exp->tuple.src.l3num != PF_INET)
- return;
-
/*
* We assume that no NF locks are held before this callback.
* ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
cp = ip_vs_conn_out_get(&p);
if (cp) {
/* Change reply CLIENT->RS to CLIENT->VS */
+ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
+ FMT_CONN "\n",
+ __func__, ct, ct->status, ARG_CONN(cp));
new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&new_reply));
new_reply.dst.u3 = cp->vaddr;
new_reply.dst.u.tcp.port = cp->vport;
- IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
- ", inout cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
goto alter;
}
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
cp = ip_vs_conn_in_get(&p);
if (cp) {
/* Change reply VS->CLIENT to RS->CLIENT */
+ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
+ FMT_CONN "\n",
+ __func__, ct, ct->status, ARG_CONN(cp));
new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&new_reply));
new_reply.src.u3 = cp->daddr;
new_reply.src.u.tcp.port = cp->dport;
- IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", outin cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
goto alter;
}
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
- " - unknown expect\n",
- __func__, ct, ct->status, ARG_TUPLE(orig));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
return;
alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
exp->expectfn = ip_vs_nfct_expect_callback;
- IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&exp->tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
nf_ct_expect_related(exp);
nf_ct_expect_put(exp);
}
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
tuple.dst.u3 = cp->vaddr;
tuple.dst.u.all = cp->vport;
- IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
- " for conn " FMT_CONN "\n",
- __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
+ __func__, ARG_CONN(cp));
h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_kill(ct)) {
- IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
- FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
} else {
- IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
- FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
}
nf_ct_put(ct);
} else {
- IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
- __func__, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
}
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index eff7569824e5..3250c4a1111e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- ret = ip_vs_app_pkt_out(cp, skb);
+ ret = ip_vs_app_pkt_out(cp, skb, iph);
if (ret == 0)
return 0;
/* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- ret = ip_vs_app_pkt_in(cp, skb);
+ ret = ip_vs_app_pkt_in(cp, skb, iph);
if (ret == 0)
return 0;
/* ret=2: csum update is needed after payload mangling */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index bcd9b7bde4ee..80d10ad12a15 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
- if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
return tcp_state_active_table[state];
}
-static struct tcp_states_t tcp_states [] = {
+static struct tcp_states_t tcp_states[] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t tcp_states_dos [] = {
+static struct tcp_states_t tcp_states_dos[] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index c15ef7c2a1fa..e0ef11c3691e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
/*
* Call application helper if needed
*/
- if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn
*/
- if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+ IP_VS_SH_TAB_BITS)) &
IP_VS_SH_TAB_MASK;
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4527921b1c3a..ba0a0fd045c8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
/* check and decrement ttl */
if (ipv6_hdr(skb)->hop_limit <= 1) {
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
+
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 153e690e2893..3b5059a8dcdd 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -79,7 +79,7 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
return memcmp(a, b, klen * sizeof(u32));
}
-static bool add_hlist(struct hlist_head *head,
+bool nf_conncount_add(struct hlist_head *head,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conncount_tuple *conn;
@@ -91,12 +91,12 @@ static bool add_hlist(struct hlist_head *head,
hlist_add_head(&conn->node, head);
return true;
}
+EXPORT_SYMBOL_GPL(nf_conncount_add);
-static unsigned int check_hlist(struct net *net,
- struct hlist_head *head,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone,
- bool *addit)
+unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone,
+ bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn;
@@ -141,6 +141,7 @@ static unsigned int check_hlist(struct net *net,
return length;
}
+EXPORT_SYMBOL_GPL(nf_conncount_lookup);
static void tree_nodes_free(struct rb_root *root,
struct nf_conncount_rb *gc_nodes[],
@@ -187,13 +188,15 @@ count_tree(struct net *net, struct rb_root *root,
} else {
/* same source network -> be counted! */
unsigned int count;
- count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+
+ count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
+ zone, &addit);
tree_nodes_free(root, gc_nodes, gc_count);
if (!addit)
return count;
- if (!add_hlist(&rbconn->hhead, tuple))
+ if (!nf_conncount_add(&rbconn->hhead, tuple))
return 0; /* hotdrop */
return count + 1;
@@ -203,7 +206,7 @@ count_tree(struct net *net, struct rb_root *root,
continue;
/* only used for GC on hhead, retval and 'addit' ignored */
- check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+ nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
if (hlist_empty(&rbconn->hhead))
gc_nodes[gc_count++] = rbconn;
}
@@ -303,11 +306,19 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
}
EXPORT_SYMBOL_GPL(nf_conncount_init);
-static void destroy_tree(struct rb_root *r)
+void nf_conncount_cache_free(struct hlist_head *hhead)
{
struct nf_conncount_tuple *conn;
- struct nf_conncount_rb *rbconn;
struct hlist_node *n;
+
+ hlist_for_each_entry_safe(conn, n, hhead, node)
+ kmem_cache_free(conncount_conn_cachep, conn);
+}
+EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
+
+static void destroy_tree(struct rb_root *r)
+{
+ struct nf_conncount_rb *rbconn;
struct rb_node *node;
while ((node = rb_first(r)) != NULL) {
@@ -315,8 +326,7 @@ static void destroy_tree(struct rb_root *r)
rb_erase(node, r);
- hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
- kmem_cache_free(conncount_conn_cachep, conn);
+ nf_conncount_cache_free(&rbconn->hhead);
kmem_cache_free(conncount_rb_cachep, rbconn);
}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 41ff04ee2554..3465da2a98bd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -58,11 +58,6 @@
#include "nf_internals.h"
-int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
- enum nf_nat_manip_type manip,
- const struct nlattr *attr) __read_mostly;
-EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
-
__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);
@@ -186,6 +181,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;
@@ -1611,6 +1607,82 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
+static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+ enum ip_conntrack_info ctinfo;
+ struct nf_nat_hook *nat_hook;
+ unsigned int dataoff, status;
+ struct nf_conn *ct;
+ u16 l3num;
+ u8 l4num;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || nf_ct_is_confirmed(ct))
+ return 0;
+
+ l3num = nf_ct_l3num(ct);
+ l3proto = nf_ct_l3proto_find_get(l3num);
+
+ if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+ &l4num) <= 0)
+ return -1;
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+ l4num, net, &tuple, l3proto, l4proto))
+ return -1;
+
+ if (ct->status & IPS_SRC_NAT) {
+ memcpy(tuple.src.u3.all,
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
+ sizeof(tuple.src.u3.all));
+ tuple.src.u.all =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
+ }
+
+ if (ct->status & IPS_DST_NAT) {
+ memcpy(tuple.dst.u3.all,
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
+ sizeof(tuple.dst.u3.all));
+ tuple.dst.u.all =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
+ }
+
+ h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
+ if (!h)
+ return 0;
+
+ /* Store status bits of the conntrack that is clashing to re-do NAT
+ * mangling according to what it has been done already to this packet.
+ */
+ status = ct->status;
+
+ nf_ct_put(ct);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ nf_ct_set(skb, ct, ctinfo);
+
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (!nat_hook)
+ return 0;
+
+ if (status & IPS_SRC_NAT &&
+ nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
+ IP_CT_DIR_ORIGINAL) == NF_DROP)
+ return -1;
+
+ if (status & IPS_DST_NAT &&
+ nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
+ IP_CT_DIR_ORIGINAL) == NF_DROP)
+ return -1;
+
+ return 0;
+}
+
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -1812,8 +1884,7 @@ void nf_conntrack_cleanup_start(void)
void nf_conntrack_cleanup_end(void)
{
- RCU_INIT_POINTER(nf_ct_destroy, NULL);
-
+ RCU_INIT_POINTER(nf_ct_hook, NULL);
cancel_delayed_work_sync(&conntrack_gc_work.dwork);
nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
@@ -2130,11 +2201,16 @@ err_cachep:
return ret;
}
+static struct nf_ct_hook nf_conntrack_hook = {
+ .update = nf_conntrack_update,
+ .destroy = destroy_conntrack,
+};
+
void nf_conntrack_init_end(void)
{
/* For use by REJECT target */
RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
- RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+ RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
/*
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index f0e9a7511e1a..a11c304fb771 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -566,8 +566,7 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
.timeout = 5 * 60,
};
-/* don't make this __exit, since it's called from __init ! */
-static void nf_conntrack_ftp_fini(void)
+static void __exit nf_conntrack_ftp_fini(void)
{
nf_conntrack_helpers_unregister(ftp, ports_c * 2);
kfree(ftp_buffer);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 5523acce9d69..4099f4d79bae 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -232,8 +232,6 @@ static int help(struct sk_buff *skb, unsigned int protoff,
static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
static struct nf_conntrack_expect_policy irc_exp_policy;
-static void nf_conntrack_irc_fini(void);
-
static int __init nf_conntrack_irc_init(void)
{
int i, ret;
@@ -276,9 +274,7 @@ static int __init nf_conntrack_irc_init(void)
return 0;
}
-/* This function is intentionally _NOT_ defined as __exit, because
- * it is needed by the init function */
-static void nf_conntrack_irc_fini(void)
+static void __exit nf_conntrack_irc_fini(void)
{
nf_conntrack_helpers_unregister(irc, ports_c);
kfree(irc_buffer);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4c1d0c5bc268..39327a42879f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1431,11 +1431,11 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
{
- typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+ struct nf_nat_hook *nat_hook;
int err;
- parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
- if (!parse_nat_setup) {
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (!nat_hook) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
@@ -1446,13 +1446,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
- if (nfnetlink_parse_nat_setup_hook)
+ if (nat_hook->parse_nat_setup)
return -EAGAIN;
#endif
return -EOPNOTSUPP;
}
- err = parse_nat_setup(ct, manip, attr);
+ err = nat_hook->parse_nat_setup(ct, manip, attr);
if (err == -EAGAIN) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
@@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return skb->len;
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index ae457f39d5ce..5072ff96ab33 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -173,8 +173,7 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
.timeout = 5 * 60,
};
-/* don't make this __exit, since it's called from __init ! */
-static void nf_conntrack_sane_fini(void)
+static void __exit nf_conntrack_sane_fini(void)
{
nf_conntrack_helpers_unregister(sane, ports_c * 2);
kfree(sane_buffer);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 908e51e2dc2b..c8d2b6688a2a 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1617,7 +1617,7 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1
},
};
-static void nf_conntrack_sip_fini(void)
+static void __exit nf_conntrack_sip_fini(void)
{
nf_conntrack_helpers_unregister(sip, ports_c * 4);
}
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 0ec6779fd5d9..548b673b3625 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -104,7 +104,7 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = {
.timeout = 5 * 60,
};
-static void nf_conntrack_tftp_fini(void)
+static void __exit nf_conntrack_tftp_fini(void)
{
nf_conntrack_helpers_unregister(tftp, ports_c * 2);
}
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c
index ec410cae9307..eb0d1658ac05 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -4,6 +4,8 @@
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
@@ -16,6 +18,43 @@ struct flow_offload_entry {
struct rcu_head rcu_head;
};
+static DEFINE_MUTEX(flowtable_lock);
+static LIST_HEAD(flowtables);
+
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+ struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ ft->dir = dir;
+
+ switch (ctt->src.l3num) {
+ case NFPROTO_IPV4:
+ ft->src_v4 = ctt->src.u3.in;
+ ft->dst_v4 = ctt->dst.u3.in;
+ ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
+ break;
+ case NFPROTO_IPV6:
+ ft->src_v6 = ctt->src.u3.in6;
+ ft->dst_v6 = ctt->dst.u3.in6;
+ ft->mtu = ip6_dst_mtu_forward(dst);
+ break;
+ }
+
+ ft->l3proto = ctt->src.l3num;
+ ft->l4proto = ctt->dst.protonum;
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+
+ ft->iifidx = route->tuple[dir].ifindex;
+ ft->oifidx = route->tuple[!dir].ifindex;
+ ft->dst_cache = dst;
+}
+
struct flow_offload *
flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
{
@@ -40,69 +79,12 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
entry->ct = ct;
- switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
- case NFPROTO_IPV4:
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
- break;
- case NFPROTO_IPV6:
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
- break;
- }
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
- FLOW_OFFLOAD_DIR_ORIGINAL;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
- FLOW_OFFLOAD_DIR_REPLY;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
if (ct->status & IPS_SRC_NAT)
flow->flags |= FLOW_OFFLOAD_SNAT;
- else if (ct->status & IPS_DST_NAT)
+ if (ct->status & IPS_DST_NAT)
flow->flags |= FLOW_OFFLOAD_DNAT;
return flow;
@@ -118,6 +100,43 @@ err_ct_refcnt:
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
+static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+{
+ tcp->state = TCP_CONNTRACK_ESTABLISHED;
+ tcp->seen[0].td_maxwin = 0;
+ tcp->seen[1].td_maxwin = 0;
+}
+
+static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+{
+ const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
+ unsigned int *timeouts;
+ unsigned int timeout;
+ int l4num;
+
+ l4num = nf_ct_protonum(ct);
+ if (l4num == IPPROTO_TCP)
+ flow_offload_fixup_tcp(&ct->proto.tcp);
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num);
+ if (!l4proto)
+ return;
+
+ timeouts = l4proto->get_timeouts(net);
+ if (!timeouts)
+ return;
+
+ if (l4num == IPPROTO_TCP)
+ timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
+ else if (l4num == IPPROTO_UDP)
+ timeout = timeouts[UDP_CT_REPLIED];
+ else
+ return;
+
+ ct->timeout = nfct_time_stamp + timeout;
+}
+
void flow_offload_free(struct flow_offload *flow)
{
struct flow_offload_entry *e;
@@ -125,17 +144,46 @@ void flow_offload_free(struct flow_offload *flow)
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
e = container_of(flow, struct flow_offload_entry, flow);
- nf_ct_delete(e->ct, 0, 0);
+ if (flow->flags & FLOW_OFFLOAD_DYING)
+ nf_ct_delete(e->ct, 0, 0);
nf_ct_put(e->ct);
kfree_rcu(e, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);
-void flow_offload_dead(struct flow_offload *flow)
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple *tuple = data;
+
+ return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple_rhash *tuplehash = data;
+
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
{
- flow->flags |= FLOW_OFFLOAD_DYING;
+ const struct flow_offload_tuple *tuple = arg->key;
+ const struct flow_offload_tuple_rhash *x = ptr;
+
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ return 1;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(flow_offload_dead);
+
+static const struct rhashtable_params nf_flow_offload_rhash_params = {
+ .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
+ .hashfn = flow_offload_hash,
+ .obj_hashfn = flow_offload_hash_obj,
+ .obj_cmpfn = flow_offload_hash_cmp,
+ .automatic_shrinking = true,
+};
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
@@ -143,10 +191,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -154,22 +202,51 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
+ struct flow_offload_entry *e;
+
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
+
+ e = container_of(flow, struct flow_offload_entry, flow);
+ clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
flow_offload_free(flow);
}
+void flow_offload_teardown(struct flow_offload *flow)
+{
+ struct flow_offload_entry *e;
+
+ flow->flags |= FLOW_OFFLOAD_TEARDOWN;
+
+ e = container_of(flow, struct flow_offload_entry, flow);
+ flow_offload_fixup_ct_state(e->ct);
+}
+EXPORT_SYMBOL_GPL(flow_offload_teardown);
+
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
struct flow_offload_tuple *tuple)
{
- return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
- *flow_table->type->params);
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload *flow;
+ int dir;
+
+ tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+ nf_flow_offload_rhash_params);
+ if (!tuplehash)
+ return NULL;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
+ return NULL;
+
+ return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
@@ -216,11 +293,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
return (__s32)(flow->timeout - (u32)jiffies) <= 0;
}
-static inline bool nf_flow_is_dying(const struct flow_offload *flow)
-{
- return flow->flags & FLOW_OFFLOAD_DYING;
-}
-
static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -248,7 +320,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
if (nf_flow_has_expired(flow) ||
- nf_flow_is_dying(flow))
+ (flow->flags & (FLOW_OFFLOAD_DYING |
+ FLOW_OFFLOAD_TEARDOWN)))
flow_offload_del(flow_table, flow);
}
out:
@@ -258,7 +331,7 @@ out:
return 1;
}
-void nf_flow_offload_work_gc(struct work_struct *work)
+static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct nf_flowtable *flow_table;
@@ -266,42 +339,6 @@ void nf_flow_offload_work_gc(struct work_struct *work)
nf_flow_offload_gc_step(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple *tuple = data;
-
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple_rhash *tuplehash = data;
-
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
- const void *ptr)
-{
- const struct flow_offload_tuple *tuple = arg->key;
- const struct flow_offload_tuple_rhash *x = ptr;
-
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
- return 1;
-
- return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
- .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
- .hashfn = flow_offload_hash,
- .obj_hashfn = flow_offload_hash_obj,
- .obj_cmpfn = flow_offload_hash_cmp,
- .automatic_shrinking = true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
__be16 port, __be16 new_port)
@@ -419,33 +456,69 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+int nf_flow_table_init(struct nf_flowtable *flowtable)
+{
+ int err;
+
+ INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+
+ err = rhashtable_init(&flowtable->rhashtable,
+ &nf_flow_offload_rhash_params);
+ if (err < 0)
+ return err;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &flowtable->gc_work, HZ);
+
+ mutex_lock(&flowtable_lock);
+ list_add(&flowtable->list, &flowtables);
+ mutex_unlock(&flowtable_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_init);
+
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
- if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+ if (!dev) {
+ flow_offload_teardown(flow);
return;
+ }
- flow_offload_dead(flow);
+ if (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
+ flow->tuplehash[1].tuple.iifidx == dev->ifindex)
+ flow_offload_dead(flow);
}
static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
- void *data)
+ struct net_device *dev)
{
- nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+ nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
}
void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
{
- nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+ struct nf_flowtable *flowtable;
+
+ mutex_lock(&flowtable_lock);
+ list_for_each_entry(flowtable, &flowtables, list)
+ nf_flow_table_iterate_cleanup(flowtable, dev);
+ mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
+ mutex_lock(&flowtable_lock);
+ list_del(&flow_table->list);
+ mutex_unlock(&flowtable_lock);
+ cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
WARN_ON(!nf_flow_offload_gc_step(flow_table));
+ rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 375a1881d93d..99771aa7e7ea 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
static struct nf_flowtable_type flowtable_inet = {
.family = NFPROTO_INET,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
new file mode 100644
index 000000000000..15ed91309992
--- /dev/null
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -0,0 +1,489 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+/*
+ * Check whether a packet may stay on the offload fast path.
+ *
+ * Non-TCP packets always pass (0).  For TCP, -1 is returned when the TCP
+ * header cannot be pulled into the linear area, or when FIN/RST is set, in
+ * which case the flow is also torn down.  Otherwise returns 0.
+ */
+static int nf_flow_state_check(struct flow_offload *flow, int proto,
+			       struct sk_buff *skb, unsigned int thoff)
+{
+	const struct tcphdr *th;
+
+	if (proto == IPPROTO_TCP) {
+		if (!pskb_may_pull(skb, thoff + sizeof(*th)))
+			return -1;
+
+		th = (void *)(skb_network_header(skb) + thoff);
+		if (unlikely(th->fin || th->rst)) {
+			flow_offload_teardown(flow);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Fix up the TCP checksum after an IPv4 address in the pseudo header was
+ * rewritten from @addr to @new_addr.  Returns 0 on success, -1 when the TCP
+ * header cannot be pulled or made writable.
+ */
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct tcphdr *th;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*th)))
+		return -1;
+	if (skb_try_make_writable(skb, thoff + sizeof(*th)))
+		return -1;
+
+	th = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace4(&th->check, skb, addr, new_addr, true);
+
+	return 0;
+}
+
+/*
+ * Fix up the UDP checksum after an IPv4 address in the pseudo header was
+ * rewritten from @addr to @new_addr.  A zero checksum is left untouched
+ * unless the skb is CHECKSUM_PARTIAL; a checksum that becomes zero after
+ * the update is stored as CSUM_MANGLED_0.  Returns 0 on success, -1 when the
+ * UDP header cannot be pulled or made writable.
+ */
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct udphdr *uh;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*uh)))
+		return -1;
+	if (skb_try_make_writable(skb, thoff + sizeof(*uh)))
+		return -1;
+
+	uh = (void *)(skb_network_header(skb) + thoff);
+	if (!uh->check && skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	inet_proto_csum_replace4(&uh->check, skb, addr, new_addr, true);
+	if (!uh->check)
+		uh->check = CSUM_MANGLED_0;
+
+	return 0;
+}
+
+/*
+ * Update the layer 4 checksum of an IPv4 packet after an address rewrite.
+ *
+ * Returns 0 on success (including protocols that need no fix-up) and -1 on
+ * failure.  The callers test the result with "< 0"; returning NF_DROP here
+ * would be indistinguishable from success because NF_DROP is defined as 0.
+ */
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+				  unsigned int thoff, __be32 addr,
+				  __be32 new_addr)
+{
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		return nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
+	case IPPROTO_UDP:
+		return nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Source NAT for IPv4: in the original direction the source address is
+ * replaced by the reply tuple's destination; in the reply direction the
+ * destination address is replaced by the original tuple's source.  The IP
+ * header checksum and the layer 4 checksum are updated accordingly.
+ * Returns -1 for an unknown direction, otherwise the result of the layer 4
+ * fix-up.
+ */
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 *field;
+	__be32 old_ip, new_ip;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		field = &iph->saddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		field = &iph->daddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+	} else {
+		return -1;
+	}
+
+	old_ip = *field;
+	*field = new_ip;
+	csum_replace4(&iph->check, old_ip, new_ip);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, old_ip, new_ip);
+}
+
+/*
+ * Destination NAT for IPv4: in the original direction the destination
+ * address is replaced by the reply tuple's source; in the reply direction
+ * the source address is replaced by the original tuple's destination.  The
+ * IP header checksum and the layer 4 checksum are updated accordingly.
+ * Returns -1 for an unknown direction, otherwise the result of the layer 4
+ * fix-up.
+ */
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 *field;
+	__be32 old_ip, new_ip;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		field = &iph->daddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		field = &iph->saddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+	} else {
+		return -1;
+	}
+
+	old_ip = *field;
+	*field = new_ip;
+	csum_replace4(&iph->check, old_ip, new_ip);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, old_ip, new_ip);
+}
+
+/*
+ * Apply the NAT transformations flagged on @flow (SNAT and/or DNAT) to an
+ * IPv4 packet: first the port rewrite, then the address rewrite.
+ * Returns 0 on success, -1 if any step fails.
+ *
+ * The layer 4 helpers call pskb_may_pull()/skb_try_make_writable(), which
+ * may reallocate the skb head.  The IP header pointer is therefore fetched
+ * again with ip_hdr() each time it is used after such a call instead of
+ * reusing a pointer that may have gone stale.
+ */
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			  unsigned int thoff, enum flow_offload_tuple_dir dir)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+	     nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+		return -1;
+
+	/* The SNAT step may have moved the header; reload before DNAT. */
+	iph = ip_hdr(skb);
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+	     nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * True when the transport header offset differs from the size of a bare
+ * IPv4 header, i.e. the header length field announces more than the fixed
+ * header.
+ */
+static bool ip_has_options(unsigned int thoff)
+{
+	if (thoff == sizeof(struct iphdr))
+		return false;
+
+	return true;
+}
+
+/*
+ * Build the flow table lookup key for an IPv4 packet received on @dev.
+ *
+ * Returns 0 and fills @tuple on success.  Returns -1 (packet is not eligible
+ * for the fast path) when the headers cannot be pulled, the packet is a
+ * fragment, carries IP options, is neither TCP nor UDP, or has a TTL that
+ * would expire on forwarding.
+ */
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+			    struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return -1;
+
+	iph = ip_hdr(skb);
+	thoff = iph->ihl * 4;
+
+	if (ip_is_fragment(iph) ||
+	    unlikely(ip_has_options(thoff)))
+		return -1;
+
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP)
+		return -1;
+
+	/*
+	 * The fast path decrements the TTL unconditionally; leave packets
+	 * that would expire to the regular forwarding path.
+	 */
+	if (iph->ttl <= 1)
+		return -1;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	/* pskb_may_pull() may have reallocated the head: reload the header. */
+	iph = ip_hdr(skb);
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v4.s_addr	= iph->saddr;
+	tuple->dst_v4.s_addr	= iph->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET;
+	tuple->l4proto		= iph->protocol;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+/*
+ * Modelled on ip_exceeds_mtu(): a packet exceeds @mtu when it is longer than
+ * @mtu, unless it is a GSO packet whose network-level length validates
+ * against @mtu.
+ */
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len > mtu)
+		return !(skb_is_gso(skb) &&
+			 skb_gso_validate_network_len(skb, mtu));
+
+	return false;
+}
+
+/*
+ * IPv4 flow table hook: forward packets of an offloaded flow directly to
+ * the cached output device, bypassing the regular forwarding path.
+ *
+ * @priv is the struct nf_flowtable to look the packet up in.
+ *
+ * Returns NF_ACCEPT (continue with normal processing) when the packet is
+ * not IPv4, no lookup tuple can be built, no flow matches, the output
+ * device no longer exists, the packet exceeds the flow MTU with DF set, or
+ * the state check fails.  Returns NF_DROP when the header cannot be made
+ * writable or NAT fails.  Returns NF_STOLEN once the packet has been handed
+ * to neigh_xmit().
+ */
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct rtable *rt;
+ unsigned int thoff;
+ struct iphdr *iph;
+ __be32 nexthop;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ /* Recover the flow entry and its cached route from the matched tuple. */
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+
+ /* Oversized packets with DF set are left to the regular path. */
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
+ (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*iph)))
+ return NF_DROP;
+
+ thoff = ip_hdr(skb)->ihl * 4;
+ if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
+ return NF_ACCEPT;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ return NF_DROP;
+
+ /* Refresh the flow timeout; the header is re-read after the NAT step. */
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ iph = ip_hdr(skb);
+ ip_decrease_ttl(iph);
+
+ /* Transmit via the neighbour layer using the cached route's next hop. */
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
+
+/*
+ * Fix up the TCP checksum after an IPv6 address in the pseudo header was
+ * rewritten from @addr to @new_addr.  Returns 0 on success, -1 when the TCP
+ * header cannot be pulled or made writable.
+ */
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct tcphdr *th;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*th)))
+		return -1;
+	if (skb_try_make_writable(skb, thoff + sizeof(*th)))
+		return -1;
+
+	th = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace16(&th->check, skb, addr->s6_addr32,
+				  new_addr->s6_addr32, true);
+
+	return 0;
+}
+
+/*
+ * Fix up the UDP checksum after an IPv6 address in the pseudo header was
+ * rewritten from @addr to @new_addr.  A zero checksum is left untouched
+ * unless the skb is CHECKSUM_PARTIAL; a checksum that becomes zero after
+ * the update is stored as CSUM_MANGLED_0.  Returns 0 on success, -1 when the
+ * UDP header cannot be pulled or made writable.
+ */
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct udphdr *uh;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*uh)))
+		return -1;
+	if (skb_try_make_writable(skb, thoff + sizeof(*uh)))
+		return -1;
+
+	uh = (void *)(skb_network_header(skb) + thoff);
+	if (!uh->check && skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	inet_proto_csum_replace16(&uh->check, skb, addr->s6_addr32,
+				  new_addr->s6_addr32, true);
+	if (!uh->check)
+		uh->check = CSUM_MANGLED_0;
+
+	return 0;
+}
+
+/*
+ * Update the layer 4 checksum of an IPv6 packet after an address rewrite.
+ *
+ * Returns 0 on success (including protocols that need no fix-up) and -1 on
+ * failure.  The callers test the result with "< 0"; returning NF_DROP here
+ * would be indistinguishable from success because NF_DROP is defined as 0.
+ */
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+				    unsigned int thoff, struct in6_addr *addr,
+				    struct in6_addr *new_addr)
+{
+	switch (ip6h->nexthdr) {
+	case IPPROTO_TCP:
+		return nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr);
+	case IPPROTO_UDP:
+		return nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Source NAT for IPv6: in the original direction the source address is
+ * replaced by the reply tuple's destination; in the reply direction the
+ * destination address is replaced by the original tuple's source.  The
+ * layer 4 checksum is then updated.  Returns -1 for an unknown direction,
+ * otherwise the result of the layer 4 fix-up.
+ */
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr old_ip, new_ip;
+	struct in6_addr *field;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		field = &ip6h->saddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		field = &ip6h->daddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+	} else {
+		return -1;
+	}
+
+	old_ip = *field;
+	*field = new_ip;
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &old_ip, &new_ip);
+}
+
+/*
+ * Destination NAT for IPv6: in the original direction the destination
+ * address is replaced by the reply tuple's source; in the reply direction
+ * the source address is replaced by the original tuple's destination.  The
+ * layer 4 checksum is then updated.  Returns -1 for an unknown direction,
+ * otherwise the result of the layer 4 fix-up.
+ */
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr old_ip, new_ip;
+	struct in6_addr *field;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		field = &ip6h->daddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		field = &ip6h->saddr;
+		new_ip = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+	} else {
+		return -1;
+	}
+
+	old_ip = *field;
+	*field = new_ip;
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &old_ip, &new_ip);
+}
+
+/*
+ * Apply the NAT transformations flagged on @flow (SNAT and/or DNAT) to an
+ * IPv6 packet: first the port rewrite, then the address rewrite.  The
+ * transport header is assumed to follow the fixed IPv6 header directly.
+ * Returns 0 on success, -1 if any step fails.
+ *
+ * The layer 4 helpers call pskb_may_pull()/skb_try_make_writable(), which
+ * may reallocate the skb head.  The IPv6 header pointer is therefore fetched
+ * again with ipv6_hdr() each time it is used after such a call instead of
+ * reusing a pointer that may have gone stale.
+ */
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+			    struct sk_buff *skb,
+			    enum flow_offload_tuple_dir dir)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	unsigned int thoff = sizeof(*ip6h);
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+	     nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+		return -1;
+
+	/* The SNAT step may have moved the header; reload before DNAT. */
+	ip6h = ipv6_hdr(skb);
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+	     nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Build the flow table lookup key for an IPv6 packet received on @dev.
+ *
+ * Returns 0 and fills @tuple on success.  Returns -1 (packet is not eligible
+ * for the fast path) when the headers cannot be pulled, the next header is
+ * neither TCP nor UDP, or the hop limit would expire on forwarding.
+ */
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+			      struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	struct ipv6hdr *ip6h;
+	unsigned int thoff;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h)))
+		return -1;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_TCP &&
+	    ip6h->nexthdr != IPPROTO_UDP)
+		return -1;
+
+	/*
+	 * The fast path decrements the hop limit unconditionally; leave
+	 * packets that would expire to the regular forwarding path.
+	 */
+	if (ip6h->hop_limit <= 1)
+		return -1;
+
+	thoff = sizeof(*ip6h);
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	/* pskb_may_pull() may have reallocated the head: reload the header. */
+	ip6h = ipv6_hdr(skb);
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v6		= ip6h->saddr;
+	tuple->dst_v6		= ip6h->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET6;
+	tuple->l4proto		= ip6h->nexthdr;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+/*
+ * IPv6 flow table hook: forward packets of an offloaded flow directly to
+ * the cached output device, bypassing the regular forwarding path.
+ *
+ * @priv is the struct nf_flowtable to look the packet up in.
+ *
+ * Returns NF_ACCEPT (continue with normal processing) when the packet is
+ * not IPv6, no lookup tuple can be built, no flow matches, the output
+ * device no longer exists, the packet exceeds the flow MTU, or the state
+ * check fails.  Returns NF_DROP when the header cannot be made writable or
+ * NAT fails.  Returns NF_STOLEN once the packet has been handed to
+ * neigh_xmit().
+ */
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct in6_addr *nexthop;
+ struct ipv6hdr *ip6h;
+ struct rt6_info *rt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ /* Recover the flow entry and its cached route from the matched tuple. */
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+
+ /* Oversized packets are left to the regular path. */
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ return NF_ACCEPT;
+
+ if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
+ sizeof(*ip6h)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ /* Refresh the flow timeout; the header is re-read after the NAT step. */
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ ip6h = ipv6_hdr(skb);
+ ip6h->hop_limit--;
+
+ /* Transmit via the neighbour layer using the cached route's next hop. */
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 18f6d7ae995b..e15779fd58e3 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -15,4 +15,9 @@ void nf_queue_nf_hook_drop(struct net *net);
/* nf_log.c */
int __init netfilter_log_init(void);
+/* core.c */
+void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg);
+int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg);
#endif
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 617693ff9f4c..b7df32a56e7e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -32,6 +32,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
+#include "nf_internals.h"
+
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -39,11 +41,27 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
+static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;
+/*
+ * Private data attached to each base NAT hook op.  nf_nat_inet_fn() walks
+ * @entries (under RCU) to run the registered NAT lookup functions; the
+ * object itself is released with kfree_rcu() through @rcu_head.
+ */
+struct nf_nat_lookup_hook_priv {
+ struct nf_hook_entries __rcu *entries;
+
+ struct rcu_head rcu_head;
+};
+
+/*
+ * Per-family NAT hook state: @nat_hook_ops is the duplicated array of base
+ * hook ops registered for the family (NULL while none are registered) and
+ * @users counts the lookup functions currently registered on top of them.
+ */
+struct nf_nat_hooks_net {
+ struct nf_hook_ops *nat_hook_ops;
+ unsigned int users;
+};
+
+/* Per network namespace NAT state, indexed by protocol family. */
+struct nat_net {
+ struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
+};
+
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
@@ -157,7 +175,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple);
static int in_range(const struct nf_nat_l3proto *l3proto,
const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
/* If we are supposed to map IPs, then we must be in the
* range specified, otherwise let this drag us onto a new src IP.
@@ -194,7 +212,7 @@ find_appropriate_src(struct net *net,
const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *result,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
unsigned int h = hash_by_src(net, tuple);
const struct nf_conn *ct;
@@ -224,7 +242,7 @@ find_appropriate_src(struct net *net,
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
const struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
@@ -298,7 +316,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
@@ -349,9 +367,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
/* Only bother mapping if it's not already in range and unique */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
- if (l4proto->in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
+ l4proto->in_range(tuple, maniptype,
+ &range->min_proto,
+ &range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
goto out;
@@ -360,7 +379,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
}
}
- /* Last change: get protocol to try to obtain unique tuple. */
+ /* Last chance: get protocol to try to obtain unique tuple. */
l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
rcu_read_unlock();
@@ -381,7 +400,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{
struct net *net = nf_ct_net(ct);
@@ -459,7 +478,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
(manip == NF_NAT_MANIP_SRC ?
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
- struct nf_nat_range range = {
+ struct nf_nat_range2 range = {
.flags = NF_NAT_RANGE_MAP_IPS,
.min_addr = ip,
.max_addr = ip,
@@ -474,17 +493,36 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+/*
+ * Rewrite @skb so that it matches the inverse of the tuple recorded for the
+ * opposite direction of @ct.  Returns NF_ACCEPT when the layer 3 handler
+ * reports success and NF_DROP otherwise.
+ */
+static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
+				     enum nf_nat_manip_type mtype,
+				     enum ip_conntrack_dir dir)
+{
+	struct nf_conntrack_tuple target;
+	const struct nf_nat_l3proto *l3;
+	const struct nf_nat_l4proto *l4;
+
+	/* The packet must end up looking like the reverse of the other side. */
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+	l3 = __nf_nat_l3proto_find(target.src.l3num);
+	l4 = __nf_nat_l4proto_find(target.src.l3num, target.dst.protonum);
+
+	return l3->manip_pkt(skb, 0, l4, &target, mtype) ? NF_ACCEPT : NF_DROP;
+}
+
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum,
struct sk_buff *skb)
{
- const struct nf_nat_l3proto *l3proto;
- const struct nf_nat_l4proto *l4proto;
+ enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned int verdict = NF_ACCEPT;
unsigned long statusbit;
- enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
if (mtype == NF_NAT_MANIP_SRC)
statusbit = IPS_SRC_NAT;
@@ -496,21 +534,87 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
statusbit ^= IPS_NAT_MASK;
/* Non-atomic: these bits don't change. */
- if (ct->status & statusbit) {
- struct nf_conntrack_tuple target;
+ if (ct->status & statusbit)
+ verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
+
+ return verdict;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+unsigned int
+nf_nat_inet_fn(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ /* maniptype == SRC for postrouting. */
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibility to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
- /* We are aiming to look like inverse of other direction. */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ nat = nfct_nat(ct);
- l3proto = __nf_nat_l3proto_find(target.src.l3num);
- l4proto = __nf_nat_l4proto_find(target.src.l3num,
- target.dst.protonum);
- if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
- return NF_DROP;
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ struct nf_nat_lookup_hook_priv *lpriv = priv;
+ struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
+ unsigned int ret;
+ int i;
+
+ if (!e)
+ goto null_bind;
+
+ for (i = 0; i < e->num_hook_entries; i++) {
+ ret = e->hooks[i].hook(e->hooks[i].priv, skb,
+ state);
+ if (ret != NF_ACCEPT)
+ return ret;
+ if (nf_nat_initialized(ct, maniptype))
+ goto do_nat;
+ }
+null_bind:
+ ret = nf_nat_alloc_null_binding(ct, state->hook);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct, ct->status);
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat,
+ state->out))
+ goto oif_changed;
+ }
+ break;
+ default:
+ /* ESTABLISHED */
+ WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
+ goto oif_changed;
}
- return NF_ACCEPT;
+do_nat:
+ return nf_nat_packet(ct, ctinfo, state->hook, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
}
-EXPORT_SYMBOL_GPL(nf_nat_packet);
+EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
struct nf_nat_proto_clean {
u8 l3proto;
@@ -702,7 +806,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
static int nfnetlink_parse_nat_proto(struct nlattr *attr,
const struct nf_conn *ct,
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
struct nlattr *tb[CTA_PROTONAT_MAX+1];
const struct nf_nat_l4proto *l4proto;
@@ -730,7 +834,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
static int
nfnetlink_parse_nat(const struct nlattr *nat,
- const struct nf_conn *ct, struct nf_nat_range *range,
+ const struct nf_conn *ct, struct nf_nat_range2 *range,
const struct nf_nat_l3proto *l3proto)
{
struct nlattr *tb[CTA_NAT_MAX+1];
@@ -758,7 +862,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
const struct nf_nat_l3proto *l3proto;
int err;
@@ -800,6 +904,146 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
.expectfn = nf_nat_follow_master,
};
+/*
+ * Register a NAT lookup function (@ops) for network namespace @net.
+ *
+ * @orig_nat_ops is the family's array of @ops_count base NAT hook ops.  The
+ * first registration for a family duplicates that array, allocates one
+ * struct nf_nat_lookup_hook_priv per entry and registers the copies as
+ * netfilter hooks.  @ops is then inserted into the entries list of the base
+ * hook whose hooknum matches @ops->hooknum.
+ *
+ * Returns 0 on success; -EINVAL when @ops->pf is out of range, an entry of
+ * @orig_nat_ops has a different pf, or no entry matches @ops->hooknum;
+ * -ENOMEM on allocation failure; -EOPNOTSUPP when the matching base hook
+ * has no private data; otherwise the error returned by hook registration
+ * or insertion.
+ */
+int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+ const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
+{
+ struct nat_net *nat_net = net_generic(net, nat_net_id);
+ struct nf_nat_hooks_net *nat_proto_net;
+ struct nf_nat_lookup_hook_priv *priv;
+ unsigned int hooknum = ops->hooknum;
+ struct nf_hook_ops *nat_ops;
+ int i, ret;
+
+ if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
+ return -EINVAL;
+
+ nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+
+ /* Translate the netfilter hook number into an index of orig_nat_ops. */
+ for (i = 0; i < ops_count; i++) {
+ if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
+ return -EINVAL;
+ if (orig_nat_ops[i].hooknum == hooknum) {
+ hooknum = i;
+ break;
+ }
+ }
+
+ if (WARN_ON_ONCE(i == ops_count))
+ return -EINVAL;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ /* First user of this family: set up and register the base hooks. */
+ if (!nat_proto_net->nat_hook_ops) {
+ WARN_ON(nat_proto_net->users != 0);
+
+ nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
+ if (!nat_ops) {
+ mutex_unlock(&nf_nat_proto_mutex);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ops_count; i++) {
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv) {
+ nat_ops[i].priv = priv;
+ continue;
+ }
+ /* Allocation failed: free the privs set up so far. */
+ mutex_unlock(&nf_nat_proto_mutex);
+ while (i)
+ kfree(nat_ops[--i].priv);
+ kfree(nat_ops);
+ return -ENOMEM;
+ }
+
+ ret = nf_register_net_hooks(net, nat_ops, ops_count);
+ if (ret < 0) {
+ mutex_unlock(&nf_nat_proto_mutex);
+ for (i = 0; i < ops_count; i++)
+ kfree(nat_ops[i].priv);
+ kfree(nat_ops);
+ return ret;
+ }
+
+ nat_proto_net->nat_hook_ops = nat_ops;
+ }
+
+ nat_ops = nat_proto_net->nat_hook_ops;
+ priv = nat_ops[hooknum].priv;
+ if (WARN_ON_ONCE(!priv)) {
+ mutex_unlock(&nf_nat_proto_mutex);
+ return -EOPNOTSUPP;
+ }
+
+ /* Only a successful insertion counts as a user of the base hooks. */
+ ret = nf_hook_entries_insert_raw(&priv->entries, ops);
+ if (ret == 0)
+ nat_proto_net->users++;
+
+ mutex_unlock(&nf_nat_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_register_fn);
+
+/*
+ * Remove a NAT lookup function (@ops) previously added with
+ * nf_nat_register_fn() from network namespace @net.
+ *
+ * @ops is deleted from the entries list of the base hook whose hooknum
+ * matches @ops->hooknum.  When the last user of the family goes away the
+ * base hooks are unregistered, their private data is freed via kfree_rcu()
+ * and the duplicated ops array is released.
+ */
+void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
+ unsigned int ops_count)
+{
+ struct nat_net *nat_net = net_generic(net, nat_net_id);
+ struct nf_nat_hooks_net *nat_proto_net;
+ struct nf_nat_lookup_hook_priv *priv;
+ struct nf_hook_ops *nat_ops;
+ int hooknum = ops->hooknum;
+ int i;
+
+ if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
+ return;
+
+ nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+
+ mutex_lock(&nf_nat_proto_mutex);
+ if (WARN_ON(nat_proto_net->users == 0))
+ goto unlock;
+
+ /* NOTE(review): users is decremented before the hooknum lookup below, so it stays decremented even if that lookup fails. */
+ nat_proto_net->users--;
+
+ /* Translate the netfilter hook number into an index of nat_ops. */
+ nat_ops = nat_proto_net->nat_hook_ops;
+ for (i = 0; i < ops_count; i++) {
+ if (nat_ops[i].hooknum == hooknum) {
+ hooknum = i;
+ break;
+ }
+ }
+ if (WARN_ON_ONCE(i == ops_count))
+ goto unlock;
+ priv = nat_ops[hooknum].priv;
+ nf_hook_entries_delete_raw(&priv->entries, ops);
+
+ /* Last user gone: tear down the base hooks and their private data. */
+ if (nat_proto_net->users == 0) {
+ nf_unregister_net_hooks(net, nat_ops, ops_count);
+
+ for (i = 0; i < ops_count; i++) {
+ priv = nat_ops[i].priv;
+ kfree_rcu(priv, rcu_head);
+ }
+
+ nat_proto_net->nat_hook_ops = NULL;
+ kfree(nat_ops);
+ }
+unlock:
+ mutex_unlock(&nf_nat_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
+
+/*
+ * Per-netns registration data: asks for a struct nat_net sized area per
+ * namespace, reachable through net_generic() with nat_net_id.
+ */
+static struct pernet_operations nat_net_ops = {
+ .id = &nat_net_id,
+ .size = sizeof(struct nat_net),
+};
+
+/*
+ * Callback table published through the nf_nat_hook pointer in
+ * nf_nat_init() and cleared again in nf_nat_cleanup().
+ */
+static struct nf_nat_hook nat_hook = {
+ .parse_nat_setup = nfnetlink_parse_nat_setup,
+#ifdef CONFIG_XFRM
+ .decode_session = __nf_nat_decode_session,
+#endif
+ .manip_pkt = nf_nat_manip_pkt,
+};
+
static int __init nf_nat_init(void)
{
int ret, i;
@@ -823,15 +1067,17 @@ static int __init nf_nat_init(void)
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
+ ret = register_pernet_subsys(&nat_net_ops);
+ if (ret < 0) {
+ nf_ct_extend_unregister(&nat_extend);
+ return ret;
+ }
+
nf_ct_helper_expectfn_register(&follow_master_nat);
- BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
- RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
- nfnetlink_parse_nat_setup);
-#ifdef CONFIG_XFRM
- BUG_ON(nf_nat_decode_session_hook != NULL);
- RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
-#endif
+ WARN_ON(nf_nat_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
+
return 0;
}
@@ -844,16 +1090,15 @@ static void __exit nf_nat_cleanup(void)
nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
- RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-#ifdef CONFIG_XFRM
- RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
-#endif
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+
synchronize_rcu();
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ unregister_pernet_subsys(&nat_net_ops);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 607a373379b4..99606baedda4 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
void nf_nat_follow_master(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 7d7466dbf663..5d849d835561 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct,
u16 *rover)
@@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
: tuple->src.u.all);
} else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
off = prandom_u32();
+ } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) {
+ off = (ntohs(*portptr) - ntohs(range->base_proto.all));
} else {
off = *rover;
}
@@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
*portptr = htons(min + off % range_size);
if (++i != range_size && nf_nat_used_tuple(tuple, ct))
continue;
- if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+ if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
+ NF_NAT_RANGE_PROTO_OFFSET)))
*rover = off;
return;
}
@@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
if (tb[CTA_PROTONAT_PORT_MIN]) {
range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 269fcd5dc34c..67ea0d83aa5a 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover;
static void
dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index c57ee3240b1d..1c5d9b65fbba 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover;
static void
sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 4f8820fc5148..f15fcd475f98 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -23,7 +23,7 @@ static u16 tcp_port_rover;
static void
tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index edd4a77dc09a..5790f70a83b2 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -22,7 +22,7 @@ static u16 udp_port_rover;
static void
udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
@@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
static void
udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
index 6e494d584412..c5db3e251232 100644
--- a/net/netfilter/nf_nat_proto_unknown.c
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 25b06b959118..adee04af8d43 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -15,7 +15,6 @@
#include <linux/inetdevice.h>
#include <linux/ip.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/types.h>
@@ -36,7 +35,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
__be32 newdst;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
hooknum != NF_INET_LOCAL_OUT);
@@ -82,10 +81,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
unsigned int
-nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
struct in6_addr newdst;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 791fac4fd745..1f3086074981 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
static void nf_nat_sip_expected(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
new file mode 100644
index 000000000000..5ba5c7bef2f9
--- /dev/null
+++ b/net/netfilter/nf_osf.c
@@ -0,0 +1,218 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/nf_osf.h>
+
+/*
+ * nf_osf_ttl - check the packet's IP TTL against a fingerprint TTL.
+ * @skb:   packet being matched; its IPv4 header supplies the TTL and saddr
+ * @info:  match configuration (flags and TTL comparison mode)
+ * @f_ttl: TTL recorded in the fingerprint
+ *
+ * Without NF_OSF_TTL in @info->flags, or with mode NF_OSF_TTL_TRUE, the TTLs
+ * must be equal.  NF_OSF_TTL_NOCHECK always passes.  Any other mode passes
+ * when the packet TTL does not exceed @f_ttl; otherwise the addresses of the
+ * receiving device are walked looking for one that matches the source.
+ *
+ * Returns non-zero when the TTL is acceptable, 0 otherwise.
+ */
+static inline int nf_osf_ttl(const struct sk_buff *skb,
+			     const struct nf_osf_info *info,
+			     unsigned char f_ttl)
+{
+	const struct iphdr *ip = ip_hdr(skb);
+	struct in_device *in_dev;
+	int ret = 0;
+
+	if (!(info->flags & NF_OSF_TTL))
+		return ip->ttl == f_ttl;
+
+	if (info->ttl == NF_OSF_TTL_TRUE)
+		return ip->ttl == f_ttl;
+	if (info->ttl == NF_OSF_TTL_NOCHECK)
+		return 1;
+	if (ip->ttl <= f_ttl)
+		return 1;
+
+	/*
+	 * The device may have no IPv4 configuration attached; for_ifa()
+	 * dereferences in_dev unconditionally, so bail out first.
+	 */
+	in_dev = __in_dev_get_rcu(skb->dev);
+	if (!in_dev)
+		return 0;
+
+	/*
+	 * NOTE(review): ip->ttl > f_ttl is already known here, so the
+	 * equality below can never hold and ret stays 0.  Kept as in the
+	 * original; confirm the intended comparison.
+	 */
+	for_ifa(in_dev) {
+		if (inet_ifa_match(ip->saddr, ifa)) {
+			ret = (ip->ttl == f_ttl);
+			break;
+		}
+	}
+	endfor_ifa(in_dev);
+
+	return ret;
+}
+
+/*
+ * Some smart modems decrease mangle MSS to SMART_MSS_2, so we check standard,
+ * decreased and the one provided in the fingerprint MSS values.
+ */
+#define SMART_MSS_1	1460
+#define SMART_MSS_2	1448
+
+/*
+ * Check the advertised TCP window against the fingerprint's window rule.
+ * @f:      fingerprint; f->wss.wc selects the rule, f->wss.val is its operand
+ * @window: window from the TCP header, host byte order
+ * @mss:    MSS parsed from the TCP options (0 when none was seen)
+ */
+static bool nf_osf_wss_match(const struct nf_osf_user_finger *f,
+			     u16 window, u16 mss)
+{
+	switch (f->wss.wc) {
+	case OSF_WSS_PLAIN:
+		return f->wss.val == 0 || window == f->wss.val;
+	case OSF_WSS_MSS:
+		return window == f->wss.val * mss ||
+		       window == f->wss.val * SMART_MSS_1 ||
+		       window == f->wss.val * SMART_MSS_2;
+	case OSF_WSS_MTU:
+		return window == f->wss.val * (mss + 40) ||
+		       window == f->wss.val * (SMART_MSS_1 + 40) ||
+		       window == f->wss.val * (SMART_MSS_2 + 40);
+	case OSF_WSS_MODULO:
+		/* A zero modulus would be a division by zero: no match. */
+		return f->wss.val != 0 && (window % f->wss.val) == 0;
+	default:
+		return false;
+	}
+}
+
+/*
+ * nf_osf_match - match a TCP SYN against the loaded OS fingerprints.
+ * @skb:            IPv4 packet
+ * @family:         protocol family, passed through to nf_log_packet()
+ * @hooknum:        hook number, passed through to nf_log_packet()
+ * @in, @out:       devices, passed through to nf_log_packet()
+ * @info:           match configuration (genre, flags, log level, TTL mode)
+ * @net:            network namespace used for logging
+ * @nf_osf_fingers: fingerprint lists, indexed by the packet's DF bit
+ *
+ * A fingerprint matches when total length, TTL, the sequence of TCP option
+ * kinds/lengths and the window rule all agree.  With NF_OSF_LOG set every
+ * fingerprint is considered regardless of genre and matches are logged.
+ *
+ * Returns true when at least one fingerprint matched; false otherwise,
+ * including for non-SYN segments and packets whose TCP header or options
+ * cannot be read.
+ */
+bool
+nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+	     int hooknum, struct net_device *in, struct net_device *out,
+	     const struct nf_osf_info *info, struct net *net,
+	     const struct list_head *nf_osf_fingers)
+{
+	const unsigned char *optp = NULL;
+	int fmatch = FMATCH_WRONG, fcount = 0;
+	const struct iphdr *ip = ip_hdr(skb);
+	const struct nf_osf_user_finger *f;
+	unsigned char opts[MAX_IPOPTLEN];
+	const struct nf_osf_finger *kf;
+	u16 window, totlen, mss = 0;
+	unsigned int optsize = 0;
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+	bool df;
+
+	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+	if (!tcp)
+		return false;
+
+	if (!tcp->syn)
+		return false;
+
+	totlen = ntohs(ip->tot_len);
+	df = ntohs(ip->frag_off) & IP_DF;
+	window = ntohs(tcp->window);
+
+	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+		optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+				sizeof(struct tcphdr), optsize, opts);
+		/* Header claims options the packet does not carry. */
+		if (!optp)
+			return false;
+	}
+
+	list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
+		unsigned int optoff = 0;	/* read offset into optp[] */
+		int foptsize, optnum;
+
+		f = &kf->finger;
+
+		if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
+			continue;
+
+		fmatch = FMATCH_WRONG;
+
+		if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
+			continue;
+
+		/*
+		 * Should not happen if userspace parser was written correctly.
+		 */
+		if (f->wss.wc >= OSF_WSS_MAX)
+			continue;
+
+		/* Check options */
+
+		foptsize = 0;
+		for (optnum = 0; optnum < f->opt_num; ++optnum)
+			foptsize += f->opt[optnum].length;
+
+		if (foptsize > MAX_IPOPTLEN ||
+		    optsize > MAX_IPOPTLEN ||
+		    optsize != foptsize)
+			continue;
+
+		for (optnum = 0; optnum < f->opt_num; ++optnum) {
+			__u32 len = f->opt[optnum].length;
+
+			/*
+			 * Never read at or beyond optsize: this also covers a
+			 * packet without options (optp == NULL) compared with
+			 * a fingerprint listing zero-length options.
+			 */
+			if (optoff >= optsize ||
+			    f->opt[optnum].kind != optp[optoff]) {
+				fmatch = FMATCH_OPT_WRONG;
+				break;
+			}
+
+			fmatch = FMATCH_OK;
+
+			/*
+			 * The MSS value is two bytes in network order after
+			 * kind and length; assemble it explicitly so the
+			 * result is the same on either host endianness.
+			 */
+			if (optp[optoff] == OSFOPT_MSS && optoff + 3 < optsize)
+				mss = ((u16)optp[optoff + 2] << 8) |
+				      optp[optoff + 3];
+
+			optoff += len;
+		}
+
+		if (fmatch != FMATCH_OPT_WRONG)
+			fmatch = nf_osf_wss_match(f, window, mss) ?
+				 FMATCH_OK : FMATCH_WRONG;
+
+		if (fmatch != FMATCH_OK)
+			continue;
+
+		fcount++;
+
+		if (info->flags & NF_OSF_LOG)
+			nf_log_packet(net, family, hooknum, skb,
+				      in, out, NULL,
+				      "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+				      f->genre, f->version, f->subtype,
+				      &ip->saddr, ntohs(tcp->source),
+				      &ip->daddr, ntohs(tcp->dest),
+				      f->ttl - ip->ttl);
+
+		if ((info->flags & NF_OSF_LOG) &&
+		    info->loglevel == NF_OSF_LOGLEVEL_FIRST)
+			break;
+	}
+
+	if (!fcount && (info->flags & NF_OSF_LOG))
+		nf_log_packet(net, family, hooknum, skb, in, out, NULL,
+			      "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+			      &ip->saddr, ntohs(tcp->source),
+			      &ip->daddr, ntohs(tcp->dest));
+
+	if (fcount)
+		fmatch = FMATCH_OK;
+
+	return fmatch == FMATCH_OK;
+}
+EXPORT_SYMBOL_GPL(nf_osf_match);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 501e48a7965b..ca4c4d994ddb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -28,6 +28,42 @@ static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static u64 table_handle;
+/*
+ * Values kept in net->nft.validate_state (see nft_validate_state_update()).
+ * NOTE(review): presumably SKIP = no validation pending, NEED = validation
+ * requested, DO = validation must run -- confirm against the users.
+ */
+enum {
+ NFT_VALIDATE_SKIP = 0,
+ NFT_VALIDATE_NEED,
+ NFT_VALIDATE_DO,
+};
+
+static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
+static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
+static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);
+
+/*
+ * rhashtable parameters for the per-table chain table (table->chains_ht).
+ * Entries are struct nft_chain linked through ->rhlhead and keyed by ->name;
+ * hashing and comparison go through the nft_chain_hash*() callbacks.
+ */
+static const struct rhashtable_params nft_chain_ht_params = {
+ .head_offset = offsetof(struct nft_chain, rhlhead),
+ .key_offset = offsetof(struct nft_chain, name),
+ .hashfn = nft_chain_hash,
+ .obj_hashfn = nft_chain_hash_obj,
+ .obj_cmpfn = nft_chain_hash_cmp,
+ .locks_mul = 1,
+ .automatic_shrinking = true,
+};
+
+/*
+ * Store @new_validate_state as the validation state of @net.
+ *
+ * Once the state is NFT_VALIDATE_DO a request for NFT_VALIDATE_NEED is
+ * ignored.  Going from NFT_VALIDATE_SKIP straight to NFT_VALIDATE_DO is
+ * unexpected and triggers a one-time warning, but is still applied.
+ */
+static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+{
+	if (net->nft.validate_state == NFT_VALIDATE_DO &&
+	    new_validate_state == NFT_VALIDATE_NEED)
+		return;
+
+	if (net->nft.validate_state == NFT_VALIDATE_SKIP)
+		WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
+
+	net->nft.validate_state = new_validate_state;
+}
+
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
const struct sk_buff *skb,
@@ -74,88 +110,43 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
-/* removal requests are queued in the commit_list, but not acted upon
- * until after all new rules are in place.
- *
- * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending
- * nf_unregister_net_hook().
- *
- * nf_register_net_hook thus fails if a nat hook is already in place
- * even if the conflicting hook is about to be removed.
- *
- * If collision is detected, search commit_log for DELCHAIN matching
- * the new nat hooknum; if we find one collision is temporary:
- *
- * Either transaction is aborted (new/colliding hook is removed), or
- * transaction is committed (old hook is removed).
- */
-static bool nf_tables_allow_nat_conflict(const struct net *net,
- const struct nf_hook_ops *ops)
-{
- const struct nft_trans *trans;
- bool ret = false;
-
- if (!ops->nat_hook)
- return false;
-
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- const struct nf_hook_ops *pending_ops;
- const struct nft_chain *pending;
-
- if (trans->msg_type != NFT_MSG_NEWCHAIN &&
- trans->msg_type != NFT_MSG_DELCHAIN)
- continue;
-
- pending = trans->ctx.chain;
- if (!nft_is_base_chain(pending))
- continue;
-
- pending_ops = &nft_base_chain(pending)->ops;
- if (pending_ops->nat_hook &&
- pending_ops->pf == ops->pf &&
- pending_ops->hooknum == ops->hooknum) {
- /* other hook registration already pending? */
- if (trans->msg_type == NFT_MSG_NEWCHAIN)
- return false;
-
- ret = true;
- }
- }
-
- return ret;
-}
-
static int nf_tables_register_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
- struct nf_hook_ops *ops;
- int ret;
+ const struct nft_base_chain *basechain;
+ const struct nf_hook_ops *ops;
if (table->flags & NFT_TABLE_F_DORMANT ||
!nft_is_base_chain(chain))
return 0;
- ops = &nft_base_chain(chain)->ops;
- ret = nf_register_net_hook(net, ops);
- if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) {
- ops->nat_hook = false;
- ret = nf_register_net_hook(net, ops);
- ops->nat_hook = true;
- }
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
- return ret;
+ if (basechain->type->ops_register)
+ return basechain->type->ops_register(net, ops);
+
+ return nf_register_net_hook(net, ops);
}
static void nf_tables_unregister_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
+ const struct nft_base_chain *basechain;
+ const struct nf_hook_ops *ops;
+
if (table->flags & NFT_TABLE_F_DORMANT ||
!nft_is_base_chain(chain))
return;
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
+
+ if (basechain->type->ops_unregister)
+ return basechain->type->ops_unregister(net, ops);
- nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+ nf_unregister_net_hook(net, ops);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -415,13 +406,17 @@ static struct nft_table *nft_table_lookup(const struct net *net,
{
struct nft_table *table;
- list_for_each_entry(table, &net->nft.tables, list) {
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry_rcu(table, &net->nft.tables, list) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
nft_active_genmask(table, genmask))
return table;
}
- return NULL;
+
+ return ERR_PTR(-ENOENT);
}
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
@@ -435,37 +430,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
nft_active_genmask(table, genmask))
return table;
}
- return NULL;
-}
-
-static struct nft_table *nf_tables_table_lookup(const struct net *net,
- const struct nlattr *nla,
- u8 family, u8 genmask)
-{
- struct nft_table *table;
-
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
- table = nft_table_lookup(net, nla, family, genmask);
- if (table != NULL)
- return table;
-
- return ERR_PTR(-ENOENT);
-}
-
-static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
- const struct nlattr *nla,
- u8 genmask)
-{
- struct nft_table *table;
-
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
- table = nft_table_lookup_byhandle(net, nla, genmask);
- if (table != NULL)
- return table;
return ERR_PTR(-ENOENT);
}
@@ -618,6 +582,24 @@ done:
return skb->len;
}
+/*
+ * Start a netlink dump from a handler that runs under rcu_read_lock().
+ *
+ * A reference on this module is taken first (-EINVAL if that fails) and
+ * dropped once netlink_dump_start() has returned.  The RCU read lock is
+ * released around the netlink_dump_start() call and taken again before
+ * returning, so the caller's lock state is the same on exit as on entry.
+ * NOTE(review): presumably the unlock is needed because
+ * netlink_dump_start() may sleep -- confirm.
+ *
+ * Returns the result of netlink_dump_start(), or -EINVAL.
+ */
+static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *c)
+{
+ int err;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ rcu_read_unlock();
+ err = netlink_dump_start(nlsk, skb, nlh, c);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ return err;
+}
+
+/* called with rcu_read_lock held */
static int nf_tables_gettable(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -633,16 +615,19 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_tables,
+ .module = THIS_MODULE,
};
- return netlink_dump_start(nlsk, skb, nlh, &c);
+
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
- table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
return PTR_ERR(table);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -749,6 +734,29 @@ err:
return ret;
}
+/*
+ * rhashtable key hash: @data is a NUL-terminated chain name.  @len is
+ * ignored; the string length is measured here instead.
+ */
+static u32 nft_chain_hash(const void *data, u32 len, u32 seed)
+{
+	const char *chain_name = data;
+	u32 name_len = strlen(chain_name);
+
+	return jhash(chain_name, name_len, seed);
+}
+
+/*
+ * rhashtable object hash: @data is a struct nft_chain, hashed by its name so
+ * that it lands in the same bucket as a lookup by key.  @len is ignored.
+ */
+static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const char *chain_name = ((const struct nft_chain *)data)->name;
+
+	return nft_chain_hash(chain_name, 0, seed);
+}
+
+/*
+ * rhashtable compare callback: returns 0 when the chain at @ptr carries the
+ * name given as lookup key in @arg, non-zero otherwise (strcmp() result).
+ */
+static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
+			      const void *ptr)
+{
+	const char *wanted = arg->key;
+	const struct nft_chain *candidate = ptr;
+
+	return strcmp(candidate->name, wanted);
+}
+
static int nf_tables_newtable(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -756,21 +764,23 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
- const struct nlattr *name;
- struct nft_table *table;
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
+ struct nft_table *table;
u32 flags = 0;
struct nft_ctx ctx;
int err;
- name = nla[NFTA_TABLE_NAME];
- table = nf_tables_table_lookup(net, name, family, genmask);
+ attr = nla[NFTA_TABLE_NAME];
+ table = nft_table_lookup(net, attr, family, genmask);
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
@@ -789,10 +799,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(name, GFP_KERNEL);
+ table->name = nla_strdup(attr, GFP_KERNEL);
if (table->name == NULL)
goto err_strdup;
+ err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
+ if (err)
+ goto err_chain_ht;
+
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
INIT_LIST_HEAD(&table->objects);
@@ -809,6 +823,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
list_add_tail_rcu(&table->list, &net->nft.tables);
return 0;
err_trans:
+ rhltable_destroy(&table->chains_ht);
+err_chain_ht:
kfree(table->name);
err_strdup:
kfree(table);
@@ -912,8 +928,9 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
- struct nft_table *table;
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
+ struct nft_table *table;
struct nft_ctx ctx;
nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
@@ -921,16 +938,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
(!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
return nft_flush(&ctx, family);
- if (nla[NFTA_TABLE_HANDLE])
- table = nf_tables_table_lookup_byhandle(net,
- nla[NFTA_TABLE_HANDLE],
- genmask);
- else
- table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME],
- family, genmask);
+ if (nla[NFTA_TABLE_HANDLE]) {
+ attr = nla[NFTA_TABLE_HANDLE];
+ table = nft_table_lookup_byhandle(net, attr, genmask);
+ } else {
+ attr = nla[NFTA_TABLE_NAME];
+ table = nft_table_lookup(net, attr, family, genmask);
+ }
- if (IS_ERR(table))
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(table);
+ }
if (nlh->nlmsg_flags & NLM_F_NONREC &&
table->use > 0)
@@ -946,6 +965,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
{
BUG_ON(ctx->table->use > 0);
+ rhltable_destroy(&ctx->table->chains_ht);
kfree(ctx->table->name);
kfree(ctx->table);
}
@@ -978,8 +998,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
*/
static struct nft_chain *
-nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
- u8 genmask)
+nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
{
struct nft_chain *chain;
@@ -992,22 +1011,35 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
return ERR_PTR(-ENOENT);
}
-static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
- const struct nlattr *nla,
- u8 genmask)
+static struct nft_chain *nft_chain_lookup(struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
+ char search[NFT_CHAIN_MAXNAMELEN + 1];
+ struct rhlist_head *tmp, *list;
struct nft_chain *chain;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry(chain, &table->chains, list) {
- if (!nla_strcmp(nla, chain->name) &&
- nft_active_genmask(chain, genmask))
- return chain;
- }
+ nla_strlcpy(search, nla, sizeof(search));
- return ERR_PTR(-ENOENT);
+ WARN_ON(!rcu_read_lock_held() &&
+ !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+ chain = ERR_PTR(-ENOENT);
+ rcu_read_lock();
+ list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
+ if (!list)
+ goto out_unlock;
+
+ rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
+ if (nft_active_genmask(chain, genmask))
+ goto out_unlock;
+ }
+ chain = ERR_PTR(-ENOENT);
+out_unlock:
+ rcu_read_unlock();
+ return chain;
}
static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
@@ -1203,6 +1235,7 @@ done:
return skb->len;
}
+/* called with rcu_read_lock held */
static int nf_tables_getchain(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -1210,8 +1243,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_cur(net);
- const struct nft_table *table;
const struct nft_chain *chain;
+ struct nft_table *table;
struct sk_buff *skb2;
int family = nfmsg->nfgen_family;
int err;
@@ -1219,20 +1252,25 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_chains,
+ .module = THIS_MODULE,
};
- return netlink_dump_start(nlsk, skb, nlh, &c);
+
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
- table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
+ }
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
return PTR_ERR(chain);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -1304,17 +1342,32 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
}
}
+/*
+ * Free the rule arrays attached to @chain: both generation arrays (freed
+ * only once when the two generations share one array) and ->rules_next.
+ */
+static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
+{
+	struct nft_rule **gen0, **gen1;
+
+	gen0 = rcu_dereference_raw(chain->rules_gen_0);
+	gen1 = rcu_dereference_raw(chain->rules_gen_1);
+
+	/* Avoid a double free when both generations point at one array. */
+	if (gen1 != gen0)
+		kvfree(gen1);
+	kvfree(gen0);
+
+	/* should be NULL either via abort or via successful commit */
+	WARN_ON_ONCE(chain->rules_next);
+	kvfree(chain->rules_next);
+}
+
static void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
struct nft_chain *chain = ctx->chain;
BUG_ON(chain->use > 0);
+ /* no concurrent access possible anymore */
+ nf_tables_chain_free_chain_rules(chain);
+
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
- if (basechain->type->free)
- basechain->type->free(ctx);
module_put(basechain->type->owner);
free_percpu(basechain->stats);
if (basechain->stats)
@@ -1404,6 +1457,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
module_put(hook->type->owner);
}
+/*
+ * Bookkeeping for a rule array that is being retired.
+ * nf_tables_chain_alloc_rules() reserves room for one of these in addition
+ * to the rule pointers.
+ * NOTE(review): presumably stored behind the array and used to free it
+ * (@start) from an RCU callback (@h) -- confirm against the commit path.
+ */
+struct nft_rules_old {
+ struct rcu_head h;
+ struct nft_rule **start;
+};
+
+/*
+ * Allocate a rule pointer array able to hold @alloc rules plus the
+ * terminating NULL entry, with extra room for a struct nft_rules_old.
+ *
+ * Returns NULL when the size would not fit in an int or when the
+ * allocation fails.  The memory is not initialised.
+ */
+static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+						     unsigned int alloc)
+{
+	unsigned int entries;
+	size_t bytes;
+
+	if (alloc > INT_MAX)
+		return NULL;
+
+	entries = alloc + 1;	/* NULL, ends rules */
+	if (sizeof(struct nft_rule *) > INT_MAX / entries)
+		return NULL;
+
+	bytes = entries * sizeof(struct nft_rule *);
+	bytes += sizeof(struct nft_rules_old);
+
+	return kvmalloc(bytes, GFP_KERNEL);
+}
+
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
u8 policy, bool create)
{
@@ -1413,6 +1487,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
struct nft_stats __percpu *stats;
struct net *net = ctx->net;
struct nft_chain *chain;
+ struct nft_rule **rules;
int err;
if (table->use == UINT_MAX)
@@ -1447,9 +1522,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
}
basechain->type = hook.type;
- if (basechain->type->init)
- basechain->type->init(ctx);
-
chain = &basechain->chain;
ops = &basechain->ops;
@@ -1460,9 +1532,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
ops->hook = hook.type->hooks[ops->hooknum];
ops->dev = hook.dev;
- if (basechain->type->type == NFT_CHAIN_T_NAT)
- ops->nat_hook = true;
-
chain->flags |= NFT_BASE_CHAIN;
basechain->policy = policy;
} else {
@@ -1481,13 +1550,31 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
goto err1;
}
+ rules = nf_tables_chain_alloc_rules(chain, 0);
+ if (!rules) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ *rules = NULL;
+ rcu_assign_pointer(chain->rules_gen_0, rules);
+ rcu_assign_pointer(chain->rules_gen_1, rules);
+
err = nf_tables_register_hook(net, table, chain);
if (err < 0)
goto err1;
+ err = rhltable_insert_key(&table->chains_ht, chain->name,
+ &chain->rhlhead, nft_chain_ht_params);
+ if (err)
+ goto err2;
+
err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
- if (err < 0)
+ if (err < 0) {
+ rhltable_remove(&table->chains_ht, &chain->rhlhead,
+ nft_chain_ht_params);
goto err2;
+ }
table->use++;
list_add_tail_rcu(&chain->list, &table->chains);
@@ -1544,8 +1631,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nla[NFTA_CHAIN_NAME]) {
struct nft_chain *chain2;
- chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME],
- genmask);
+ chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
if (!IS_ERR(chain2))
return -EEXIST;
}
@@ -1595,9 +1681,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
struct netlink_ext_ack *extack)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- const struct nlattr * uninitialized_var(name);
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
u8 policy = NF_ACCEPT;
@@ -1607,36 +1693,46 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
- table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
+ }
chain = NULL;
- name = nla[NFTA_CHAIN_NAME];
+ attr = nla[NFTA_CHAIN_NAME];
if (nla[NFTA_CHAIN_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
- chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup_byhandle(table, handle, genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
return PTR_ERR(chain);
+ }
+ attr = nla[NFTA_CHAIN_HANDLE];
} else {
- chain = nf_tables_chain_lookup(table, name, genmask);
+ chain = nft_chain_lookup(table, attr, genmask);
if (IS_ERR(chain)) {
- if (PTR_ERR(chain) != -ENOENT)
+ if (PTR_ERR(chain) != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
+ }
chain = NULL;
}
}
if (nla[NFTA_CHAIN_POLICY]) {
if (chain != NULL &&
- !nft_is_base_chain(chain))
+ !nft_is_base_chain(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
+ }
if (chain == NULL &&
- nla[NFTA_CHAIN_HOOK] == NULL)
+ nla[NFTA_CHAIN_HOOK] == NULL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
+ }
policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
switch (policy) {
@@ -1651,8 +1747,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
if (chain != NULL) {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
@@ -1669,28 +1767,34 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule;
- int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
u64 handle;
u32 use;
int err;
- table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
+ }
if (nla[NFTA_CHAIN_HANDLE]) {
- handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
- chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+ attr = nla[NFTA_CHAIN_HANDLE];
+ handle = be64_to_cpu(nla_get_be64(attr));
+ chain = nft_chain_lookup_byhandle(table, handle, genmask);
} else {
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ attr = nla[NFTA_CHAIN_NAME];
+ chain = nft_chain_lookup(table, attr, genmask);
}
- if (IS_ERR(chain))
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
+ }
if (nlh->nlmsg_flags & NLM_F_NONREC &&
chain->use > 0)
@@ -1712,8 +1816,10 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
/* There are rules and elements that are still holding references to us,
* we cannot do a recursive removal in this case.
*/
- if (use > 0)
+ if (use > 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
return nft_delchain(&ctx);
}
@@ -1905,19 +2011,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
goto err1;
}
- if (ops->validate) {
- const struct nft_data *data = NULL;
-
- err = ops->validate(ctx, expr, &data);
- if (err < 0)
- goto err2;
- }
-
return 0;
-
-err2:
- if (ops->destroy)
- ops->destroy(ctx, expr);
err1:
expr->ops = NULL;
return err;
@@ -1970,13 +2064,13 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
* Rules
*/
-static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
- u64 handle)
+static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
+ u64 handle)
{
struct nft_rule *rule;
// FIXME: this sucks
- list_for_each_entry(rule, &chain->rules, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list) {
if (handle == rule->handle)
return rule;
}
@@ -1984,13 +2078,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
return ERR_PTR(-ENOENT);
}
-static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
- const struct nlattr *nla)
+static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
+ const struct nlattr *nla)
{
if (nla == NULL)
return ERR_PTR(-EINVAL);
- return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+ return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -2172,6 +2266,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
return 0;
}
+/* called with rcu_read_lock held */
static int nf_tables_getrule(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2179,9 +2274,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_cur(net);
- const struct nft_table *table;
const struct nft_chain *chain;
const struct nft_rule *rule;
+ struct nft_table *table;
struct sk_buff *skb2;
int family = nfmsg->nfgen_family;
int err;
@@ -2190,18 +2285,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_rules,
.done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
};
if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
struct nft_rule_dump_ctx *ctx;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
if (!ctx)
return -ENOMEM;
if (nla[NFTA_RULE_TABLE]) {
ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!ctx->table) {
kfree(ctx);
return -ENOMEM;
@@ -2209,7 +2305,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
}
if (nla[NFTA_RULE_CHAIN]) {
ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!ctx->chain) {
kfree(ctx->table);
kfree(ctx);
@@ -2219,23 +2315,28 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
c.data = ctx;
}
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
- table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
+ }
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
+ }
- rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
- if (IS_ERR(rule))
+ rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -2276,6 +2377,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
nf_tables_rule_destroy(ctx, rule);
}
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+{
+ struct nft_expr *expr, *last;
+ const struct nft_data *data;
+ struct nft_rule *rule;
+ int err;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (!nft_is_active_next(ctx->net, rule))
+ continue;
+
+ nft_rule_for_each_expr(expr, last, rule) {
+ if (!expr->ops->validate)
+ continue;
+
+ err = expr->ops->validate(ctx, expr, &data);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate);
+
+static int nft_table_validate(struct net *net, const struct nft_table *table)
+{
+ struct nft_chain *chain;
+ struct nft_ctx ctx = {
+ .net = net,
+ .family = table->family,
+ };
+ int err;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_base_chain(chain))
+ continue;
+
+ ctx.chain = chain;
+ err = nft_chain_validate(&ctx, chain);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
#define NFT_RULE_MAXEXPRS 128
static struct nft_expr_info *info;
@@ -2303,23 +2451,30 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
- table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
+ }
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
+ }
if (nla[NFTA_RULE_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
- rule = __nf_tables_rule_lookup(chain, handle);
- if (IS_ERR(rule))
+ rule = __nft_rule_lookup(chain, handle);
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
+ }
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
@@ -2338,9 +2493,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return -EOPNOTSUPP;
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
- old_rule = __nf_tables_rule_lookup(chain, pos_handle);
- if (IS_ERR(old_rule))
+ old_rule = __nft_rule_lookup(chain, pos_handle);
+ if (IS_ERR(old_rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
+ }
}
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
@@ -2394,6 +2551,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
err = nf_tables_newexpr(&ctx, &info[i], expr);
if (err < 0)
goto err2;
+
+ if (info[i].ops->validate)
+ nft_validate_state_update(net, NFT_VALIDATE_NEED);
+
info[i].ops = NULL;
expr = nft_expr_next(expr);
}
@@ -2437,8 +2598,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
}
chain->use++;
- return 0;
+ if (net->nft.validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
+
+ return 0;
err2:
nf_tables_rule_release(&ctx, rule);
err1:
@@ -2478,32 +2642,37 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
int family = nfmsg->nfgen_family, err = 0;
struct nft_ctx ctx;
- table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
+ }
if (nla[NFTA_RULE_CHAIN]) {
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN],
- genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
+ }
}
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
- rule = nf_tables_rule_lookup(chain,
- nla[NFTA_RULE_HANDLE]);
- if (IS_ERR(rule))
+ rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
+ }
err = nft_delrule(&ctx, rule);
} else if (nla[NFTA_RULE_ID]) {
rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
- if (IS_ERR(rule))
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
return PTR_ERR(rule);
+ }
err = nft_delrule(&ctx, rule);
} else {
@@ -2548,14 +2717,12 @@ void nft_unregister_set(struct nft_set_type *type)
EXPORT_SYMBOL_GPL(nft_unregister_set);
#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
- NFT_SET_TIMEOUT | NFT_SET_OBJECT)
+ NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
+ NFT_SET_EVAL)
-static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags)
+static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
{
- if ((flags & NFT_SET_EVAL) && !ops->update)
- return false;
-
- return (flags & ops->features) == (flags & NFT_SET_FEATURES);
+ return (flags & type->features) == (flags & NFT_SET_FEATURES);
}
/*
@@ -2592,14 +2759,9 @@ nft_select_set_ops(const struct nft_ctx *ctx,
best.space = ~0;
list_for_each_entry(type, &nf_tables_set_types, list) {
- if (!type->select_ops)
- ops = type->ops;
- else
- ops = type->select_ops(ctx, desc, flags);
- if (!ops)
- continue;
+ ops = &type->ops;
- if (!nft_set_ops_candidate(ops, flags))
+ if (!nft_set_ops_candidate(type, flags))
continue;
if (!ops->estimate(desc, flags, &est))
continue;
@@ -2630,7 +2792,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
if (!try_module_get(type->owner))
continue;
if (bops != NULL)
- module_put(bops->type->owner);
+ module_put(to_set_type(bops)->owner);
bops = ops;
best = est;
@@ -2671,6 +2833,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
+ struct netlink_ext_ack *extack,
u8 genmask)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2678,25 +2841,27 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
struct nft_table *table = NULL;
if (nla[NFTA_SET_TABLE] != NULL) {
- table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
+ }
}
nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
return 0;
}
-static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
- const struct nlattr *nla, u8 genmask)
+static struct nft_set *nft_set_lookup(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_set *set;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry(set, &table->sets, list) {
+ list_for_each_entry_rcu(set, &table->sets, list) {
if (!nla_strcmp(nla, set->name) &&
nft_active_genmask(set, genmask))
return set;
@@ -2704,14 +2869,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
return ERR_PTR(-ENOENT);
}
-static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table,
- const struct nlattr *nla, u8 genmask)
+static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla,
+ u8 genmask)
{
struct nft_set *set;
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
list_for_each_entry(set, &table->sets, list) {
if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
nft_active_genmask(set, genmask))
@@ -2720,9 +2883,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab
return ERR_PTR(-ENOENT);
}
-static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
- const struct nlattr *nla,
- u8 genmask)
+static struct nft_set *nft_set_lookup_byid(const struct net *net,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_trans *trans;
u32 id = ntohl(nla_get_be32(nla));
@@ -2746,12 +2908,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
{
struct nft_set *set;
- set = nf_tables_set_lookup(table, nla_set_name, genmask);
+ set = nft_set_lookup(table, nla_set_name, genmask);
if (IS_ERR(set)) {
if (!nla_set_id)
return set;
- set = nf_tables_set_lookup_byid(net, nla_set_id, genmask);
+ set = nft_set_lookup_byid(net, nla_set_id, genmask);
}
return set;
}
@@ -2811,6 +2973,27 @@ cont:
return 0;
}
+static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+{
+ u64 ms = be64_to_cpu(nla_get_be64(nla));
+ u64 max = (u64)(~((u64)0));
+
+ max = div_u64(max, NSEC_PER_MSEC);
+ if (ms >= max)
+ return -ERANGE;
+
+ ms *= NSEC_PER_MSEC;
+ *result = nsecs_to_jiffies64(ms);
+ return 0;
+}
+
+static __be64 nf_jiffies64_to_msecs(u64 input)
+{
+ u64 ms = jiffies64_to_nsecs(input);
+
+ return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
@@ -2858,7 +3041,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (set->timeout &&
nla_put_be64(skb, NFTA_SET_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(set->timeout)),
+ nf_jiffies64_to_msecs(set->timeout),
NFTA_SET_PAD))
goto nla_put_failure;
if (set->gc_int &&
@@ -2983,6 +3166,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
return 0;
}
+/* called with rcu_read_lock held */
static int nf_tables_getset(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2996,7 +3180,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
@@ -3004,17 +3189,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_sets,
.done = nf_tables_dump_sets_done,
+ .module = THIS_MODULE,
};
struct nft_ctx *ctx_dump;
- ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL);
+ ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
if (ctx_dump == NULL)
return -ENOMEM;
*ctx_dump = ctx;
c.data = ctx_dump;
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
/* Only accept unspec with dump */
@@ -3023,11 +3209,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
return -ENOMEM;
@@ -3153,8 +3339,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- nla[NFTA_SET_TIMEOUT])));
+
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout);
+ if (err)
+ return err;
}
gc_int = 0;
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -3175,22 +3363,28 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
- table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
+ }
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
- set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
- if (PTR_ERR(set) != -ENOENT)
+ if (PTR_ERR(set) != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
+ }
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
+
return 0;
}
@@ -3233,6 +3427,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
+ set->table = table;
+ write_pnet(&set->net, net);
set->ops = ops;
set->ktype = ktype;
set->klen = desc.klen;
@@ -3267,14 +3463,14 @@ err3:
err2:
kvfree(set);
err1:
- module_put(ops->type->owner);
+ module_put(to_set_type(ops)->owner);
return err;
}
static void nft_set_destroy(struct nft_set *set)
{
set->ops->destroy(set);
- module_put(set->ops->type->owner);
+ module_put(to_set_type(set->ops)->owner);
kfree(set->name);
kvfree(set);
}
@@ -3293,6 +3489,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
+ const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
int err;
@@ -3302,20 +3499,28 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
- if (nla[NFTA_SET_HANDLE])
- set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask);
- else
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
- if (IS_ERR(set))
- return PTR_ERR(set);
+ if (nla[NFTA_SET_HANDLE]) {
+ attr = nla[NFTA_SET_HANDLE];
+ set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
+ } else {
+ attr = nla[NFTA_SET_NAME];
+ set = nft_set_lookup(ctx.table, attr, genmask);
+ }
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return PTR_ERR(set);
+ }
if (!list_empty(&set->bindings) ||
- (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0))
+ (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
return nft_delset(&ctx, set);
}
@@ -3405,8 +3610,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u64),
},
[NFT_SET_EXT_EXPIRATION] = {
- .len = sizeof(unsigned long),
- .align = __alignof__(unsigned long),
+ .len = sizeof(u64),
+ .align = __alignof__(u64),
},
[NFT_SET_EXT_USERDATA] = {
.len = sizeof(struct nft_userdata),
@@ -3443,16 +3648,19 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
+ struct netlink_ext_ack *extack,
u8 genmask)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
int family = nfmsg->nfgen_family;
struct nft_table *table;
- table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
return PTR_ERR(table);
+ }
nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
return 0;
@@ -3496,22 +3704,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(
- *nft_set_ext_timeout(ext))),
+ nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
NFTA_SET_ELEM_PAD))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- unsigned long expires, now = jiffies;
+ u64 expires, now = get_jiffies_64();
expires = *nft_set_ext_expiration(ext);
- if (time_before(now, expires))
+ if (time_before64(now, expires))
expires -= now;
else
expires = 0;
if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- cpu_to_be64(jiffies_to_msecs(expires)),
+ nf_jiffies64_to_msecs(expires),
NFTA_SET_ELEM_PAD))
goto nla_put_failure;
}
@@ -3749,7 +3956,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
ext = nft_set_elem_ext(set, &elem);
err = -ENOMEM;
- skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb == NULL)
goto err1;
@@ -3771,6 +3978,7 @@ err1:
return err == -EAGAIN ? -ENOBUFS : err;
}
+/* called with rcu_read_lock held */
static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -3782,12 +3990,12 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
- genmask);
+ set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
@@ -3795,10 +4003,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_set,
.done = nf_tables_dump_set_done,
+ .module = THIS_MODULE,
};
struct nft_set_dump_ctx *dump_ctx;
- dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+ dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
if (!dump_ctx)
return -ENOMEM;
@@ -3806,7 +4015,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
dump_ctx->ctx = ctx;
c.data = dump_ctx;
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -3886,7 +4095,7 @@ void *nft_set_elem_init(const struct nft_set *set,
memcpy(nft_set_ext_data(ext), data, set->dlen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
*nft_set_ext_expiration(ext) =
- jiffies + timeout;
+ get_jiffies_64() + timeout;
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;
@@ -3897,12 +4106,24 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+ struct nft_ctx ctx = {
+ .net = read_pnet(&set->net),
+ .family = set->table->family,
+ };
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
+ struct nft_expr *expr = nft_set_ext_expr(ext);
+
+ if (expr->ops->destroy_clone) {
+ expr->ops->destroy_clone(&ctx, expr);
+ module_put(expr->ops->type->owner);
+ } else {
+ nf_tables_expr_destroy(&ctx, expr);
+ }
+ }
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
(*nft_set_ext_obj(ext))->use--;
kfree(elem);
@@ -3912,12 +4133,13 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
/* Only called from commit path, nft_set_elem_deactivate() already deals with
* the refcounting from the preparation phase.
*/
-static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem)
+static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set, void *elem)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+ nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
kfree(elem);
}
@@ -3973,8 +4195,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- nla[NFTA_SET_ELEM_TIMEOUT])));
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
+ &timeout);
+ if (err)
+ return err;
} else if (set->flags & NFT_SET_TIMEOUT) {
timeout = set->timeout;
}
@@ -3999,8 +4223,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -EINVAL;
goto err2;
}
- obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
- set->objtype, genmask);
+ obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+ set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
goto err2;
@@ -4035,6 +4259,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
d2.type, d2.len);
if (err < 0)
goto err3;
+
+ if (d2.type == NFT_DATA_VERDICT &&
+ (data.verdict.code == NFT_GOTO ||
+ data.verdict.code == NFT_JUMP))
+ nft_validate_state_update(ctx->net,
+ NFT_VALIDATE_NEED);
}
nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4134,12 +4364,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
- int rem, err = 0;
+ int rem, err;
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
@@ -4154,9 +4385,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
if (err < 0)
- break;
+ return err;
}
- return err;
+
+ if (net->nft.validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, ctx.table);
+
+ return 0;
}
/**
@@ -4327,12 +4562,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
- genmask);
+ set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
@@ -4420,13 +4655,13 @@ void nft_unregister_obj(struct nft_object_type *obj_type)
}
EXPORT_SYMBOL_GPL(nft_unregister_obj);
-struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
- const struct nlattr *nla,
- u32 objtype, u8 genmask)
+struct nft_object *nft_obj_lookup(const struct nft_table *table,
+ const struct nlattr *nla, u32 objtype,
+ u8 genmask)
{
struct nft_object *obj;
- list_for_each_entry(obj, &table->objects, list) {
+ list_for_each_entry_rcu(obj, &table->objects, list) {
if (!nla_strcmp(nla, obj->name) &&
objtype == obj->ops->type->type &&
nft_active_genmask(obj, genmask))
@@ -4434,11 +4669,11 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
}
return ERR_PTR(-ENOENT);
}
-EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+EXPORT_SYMBOL_GPL(nft_obj_lookup);
-static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
- const struct nlattr *nla,
- u32 objtype, u8 genmask)
+static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla,
+ u32 objtype, u8 genmask)
{
struct nft_object *obj;
@@ -4582,22 +4817,25 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
!nla[NFTA_OBJ_DATA])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
+ }
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- if (err != -ENOENT)
+ if (err != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return err;
-
+ }
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return -EEXIST;
-
+ }
return 0;
}
@@ -4632,7 +4870,7 @@ err3:
kfree(obj->name);
err2:
if (obj->ops->destroy)
- obj->ops->destroy(obj);
+ obj->ops->destroy(&ctx, obj);
kfree(obj);
err1:
module_put(type->owner);
@@ -4753,12 +4991,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
{
struct nft_obj_filter *filter;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
if (!filter)
return ERR_PTR(-ENOMEM);
if (nla[NFTA_OBJ_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL);
+ filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
if (!filter->table) {
kfree(filter);
return ERR_PTR(-ENOMEM);
@@ -4770,6 +5008,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
return filter;
}
+/* called with rcu_read_lock held */
static int nf_tables_getobj(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -4789,6 +5028,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_obj,
.done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
};
if (nla[NFTA_OBJ_TABLE] ||
@@ -4801,24 +5041,27 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
c.data = filter;
}
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
+ }
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
- if (IS_ERR(obj))
+ obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return PTR_ERR(obj);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -4837,10 +5080,10 @@ err:
return err;
}
-static void nft_obj_destroy(struct nft_object *obj)
+static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
{
if (obj->ops->destroy)
- obj->ops->destroy(obj);
+ obj->ops->destroy(ctx, obj);
module_put(obj->ops->type->owner);
kfree(obj->name);
@@ -4855,6 +5098,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_object *obj;
struct nft_ctx ctx;
@@ -4864,22 +5108,29 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
(!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
+ }
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- if (nla[NFTA_OBJ_HANDLE])
- obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
- objtype, genmask);
- else
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME],
- objtype, genmask);
- if (IS_ERR(obj))
+ if (nla[NFTA_OBJ_HANDLE]) {
+ attr = nla[NFTA_OBJ_HANDLE];
+ obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask);
+ } else {
+ attr = nla[NFTA_OBJ_NAME];
+ obj = nft_obj_lookup(table, attr, objtype, genmask);
+ }
+
+ if (IS_ERR(obj)) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(obj);
- if (obj->use > 0)
+ }
+ if (obj->use > 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
@@ -4950,24 +5201,23 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
[NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 },
};
-struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
- const struct nlattr *nla,
- u8 genmask)
+struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
- list_for_each_entry(flowtable, &table->flowtables, list) {
+ list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
if (!nla_strcmp(nla, flowtable->name) &&
nft_active_genmask(flowtable, genmask))
return flowtable;
}
return ERR_PTR(-ENOENT);
}
-EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
+EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
static struct nft_flowtable *
-nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
- const struct nlattr *nla, u8 genmask)
+nft_flowtable_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
@@ -5066,7 +5316,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
flowtable->ops[i].pf = NFPROTO_NETDEV;
flowtable->ops[i].hooknum = hooknum;
flowtable->ops[i].priority = priority;
- flowtable->ops[i].priv = &flowtable->data.rhashtable;
+ flowtable->ops[i].priv = &flowtable->data;
flowtable->ops[i].hook = flowtable->data.type->hook;
flowtable->ops[i].dev = dev_array[i];
flowtable->dev_name[i] = kstrdup(dev_array[i]->name,
@@ -5107,23 +5357,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
return ERR_PTR(-ENOENT);
}
-void nft_flow_table_iterate(struct net *net,
- void (*iter)(struct nf_flowtable *flowtable, void *data),
- void *data)
-{
- struct nft_flowtable *flowtable;
- const struct nft_table *table;
-
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_for_each_entry(table, &net->nft.tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- iter(&flowtable->data, data);
- }
- }
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
-
static void nft_unregister_flowtable_net_hooks(struct net *net,
struct nft_flowtable *flowtable)
{
@@ -5157,20 +5390,26 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
!nla[NFTA_FLOWTABLE_HOOK])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ genmask);
if (IS_ERR(flowtable)) {
err = PTR_ERR(flowtable);
- if (err != -ENOENT)
+ if (err != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return err;
+ }
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return -EEXIST;
+ }
return 0;
}
@@ -5197,14 +5436,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
}
flowtable->data.type = type;
- err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+ err = type->init(&flowtable->data);
if (err < 0)
goto err3;
err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
flowtable);
if (err < 0)
- goto err3;
+ goto err4;
for (i = 0; i < flowtable->ops_len; i++) {
if (!flowtable->ops[i].dev)
@@ -5218,37 +5457,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (flowtable->ops[i].dev == ft->ops[k].dev &&
flowtable->ops[i].pf == ft->ops[k].pf) {
err = -EBUSY;
- goto err4;
+ goto err5;
}
}
}
err = nf_register_net_hook(net, &flowtable->ops[i]);
if (err < 0)
- goto err4;
+ goto err5;
}
err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
- goto err5;
-
- INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
- queue_delayed_work(system_power_efficient_wq,
- &flowtable->data.gc_work, HZ);
+ goto err6;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
table->use++;
return 0;
-err5:
+err6:
i = flowtable->ops_len;
-err4:
+err5:
for (k = i - 1; k >= 0; k--) {
kfree(flowtable->dev_name[k]);
nf_unregister_net_hook(net, &flowtable->ops[k]);
}
kfree(flowtable->ops);
+err4:
+ flowtable->data.type->free(&flowtable->data);
err3:
module_put(type->owner);
err2:
@@ -5268,6 +5505,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
@@ -5276,23 +5514,29 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
!nla[NFTA_FLOWTABLE_HANDLE]))
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- if (nla[NFTA_FLOWTABLE_HANDLE])
- flowtable = nf_tables_flowtable_lookup_byhandle(table,
- nla[NFTA_FLOWTABLE_HANDLE],
- genmask);
- else
- flowtable = nf_tables_flowtable_lookup(table,
- nla[NFTA_FLOWTABLE_NAME],
- genmask);
- if (IS_ERR(flowtable))
- return PTR_ERR(flowtable);
- if (flowtable->use > 0)
+ if (nla[NFTA_FLOWTABLE_HANDLE]) {
+ attr = nla[NFTA_FLOWTABLE_HANDLE];
+ flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
+ } else {
+ attr = nla[NFTA_FLOWTABLE_NAME];
+ flowtable = nft_flowtable_lookup(table, attr, genmask);
+ }
+
+ if (IS_ERR(flowtable)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return PTR_ERR(flowtable);
+ }
+ if (flowtable->use > 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
@@ -5423,13 +5667,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
{
struct nft_flowtable_filter *filter;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
if (!filter)
return ERR_PTR(-ENOMEM);
if (nla[NFTA_FLOWTABLE_TABLE]) {
filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!filter->table) {
kfree(filter);
return ERR_PTR(-ENOMEM);
@@ -5438,6 +5682,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
return filter;
}
+/* called with rcu_read_lock held */
static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -5456,6 +5701,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_flowtable,
.done = nf_tables_dump_flowtable_done,
+ .module = THIS_MODULE,
};
if (nla[NFTA_FLOWTABLE_TABLE]) {
@@ -5467,23 +5713,23 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
c.data = filter;
}
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
if (!nla[NFTA_FLOWTABLE_NAME])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
- family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+ genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -5532,11 +5778,9 @@ err:
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
- cancel_delayed_work_sync(&flowtable->data.gc_work);
kfree(flowtable->ops);
kfree(flowtable->name);
flowtable->data.type->free(&flowtable->data);
- rhashtable_destroy(&flowtable->data.rhashtable);
module_put(flowtable->data.type->owner);
}
@@ -5649,7 +5893,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
struct sk_buff *skb2;
int err;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
return -ENOMEM;
@@ -5671,7 +5915,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_table_policy,
},
[NFT_MSG_GETTABLE] = {
- .call = nf_tables_gettable,
+ .call_rcu = nf_tables_gettable,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
@@ -5686,7 +5930,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_chain_policy,
},
[NFT_MSG_GETCHAIN] = {
- .call = nf_tables_getchain,
+ .call_rcu = nf_tables_getchain,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
@@ -5701,7 +5945,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE] = {
- .call = nf_tables_getrule,
+ .call_rcu = nf_tables_getrule,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
@@ -5716,7 +5960,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_set_policy,
},
[NFT_MSG_GETSET] = {
- .call = nf_tables_getset,
+ .call_rcu = nf_tables_getset,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
@@ -5731,7 +5975,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETSETELEM] = {
- .call = nf_tables_getsetelem,
+ .call_rcu = nf_tables_getsetelem,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
@@ -5741,7 +5985,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETGEN] = {
- .call = nf_tables_getgen,
+ .call_rcu = nf_tables_getgen,
},
[NFT_MSG_NEWOBJ] = {
.call_batch = nf_tables_newobj,
@@ -5749,7 +5993,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ] = {
- .call = nf_tables_getobj,
+ .call_rcu = nf_tables_getobj,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
@@ -5759,7 +6003,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
- .call = nf_tables_getobj,
+ .call_rcu = nf_tables_getobj,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
@@ -5769,7 +6013,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_flowtable_policy,
},
[NFT_MSG_GETFLOWTABLE] = {
- .call = nf_tables_getflowtable,
+ .call_rcu = nf_tables_getflowtable,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
@@ -5780,12 +6024,41 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
},
};
+static int nf_tables_validate(struct net *net)
+{
+ struct nft_table *table;
+
+ switch (net->nft.validate_state) {
+ case NFT_VALIDATE_SKIP:
+ break;
+ case NFT_VALIDATE_NEED:
+ nft_validate_state_update(net, NFT_VALIDATE_DO);
+ /* fall through */
+ case NFT_VALIDATE_DO:
+ list_for_each_entry(table, &net->nft.tables, list) {
+ if (nft_table_validate(net, table) < 0)
+ return -EAGAIN;
+ }
+ break;
+ }
+
+ return 0;
+}
+
static void nft_chain_commit_update(struct nft_trans *trans)
{
struct nft_base_chain *basechain;
- if (nft_trans_chain_name(trans))
+ if (nft_trans_chain_name(trans)) {
+ rhltable_remove(&trans->ctx.table->chains_ht,
+ &trans->ctx.chain->rhlhead,
+ nft_chain_ht_params);
swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
+ rhltable_insert_key(&trans->ctx.table->chains_ht,
+ trans->ctx.chain->name,
+ &trans->ctx.chain->rhlhead,
+ nft_chain_ht_params);
+ }
if (!nft_is_base_chain(trans->ctx.chain))
return;
@@ -5817,11 +6090,12 @@ static void nft_commit_release(struct nft_trans *trans)
nft_set_destroy(nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nf_tables_set_elem_destroy(nft_trans_elem_set(trans),
+ nf_tables_set_elem_destroy(&trans->ctx,
+ nft_trans_elem_set(trans),
nft_trans_elem(trans).priv);
break;
case NFT_MSG_DELOBJ:
- nft_obj_destroy(nft_trans_obj(trans));
+ nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
break;
case NFT_MSG_DELFLOWTABLE:
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -5845,21 +6119,175 @@ static void nf_tables_commit_release(struct net *net)
}
}
+static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
+{
+ struct nft_rule *rule;
+ unsigned int alloc = 0;
+ int i;
+
+ /* already handled or inactive chain? */
+ if (chain->rules_next || !nft_is_active_next(net, chain))
+ return 0;
+
+ rule = list_entry(&chain->rules, struct nft_rule, list);
+ i = 0;
+
+ list_for_each_entry_continue(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule))
+ alloc++;
+ }
+
+ chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
+ if (!chain->rules_next)
+ return -ENOMEM;
+
+ list_for_each_entry_continue(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule))
+ chain->rules_next[i++] = rule;
+ }
+
+ chain->rules_next[i] = NULL;
+ return 0;
+}
+
+static void nf_tables_commit_chain_prepare_cancel(struct net *net)
+{
+ struct nft_trans *trans, *next;
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ struct nft_chain *chain = trans->ctx.chain;
+
+ if (trans->msg_type == NFT_MSG_NEWRULE ||
+ trans->msg_type == NFT_MSG_DELRULE) {
+ kvfree(chain->rules_next);
+ chain->rules_next = NULL;
+ }
+ }
+}
+
+static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+{
+ struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+
+ kvfree(o->start);
+}
+
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+{
+ struct nft_rule **r = rules;
+ struct nft_rules_old *old;
+
+ while (*r)
+ r++;
+
+ r++; /* rcu_head is after end marker */
+ old = (void *) r;
+ old->start = rules;
+
+ call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+}
+
+static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
+{
+ struct nft_rule **g0, **g1;
+ bool next_genbit;
+
+ next_genbit = nft_gencursor_next(net);
+
+ g0 = rcu_dereference_protected(chain->rules_gen_0,
+ lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ g1 = rcu_dereference_protected(chain->rules_gen_1,
+ lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+ /* No changes to this chain? */
+ if (chain->rules_next == NULL) {
+ /* chain had no change in last or next generation */
+ if (g0 == g1)
+ return;
+ /*
+ * chain had no change in this generation; make sure next
+ * one uses same rules as current generation.
+ */
+ if (next_genbit) {
+ rcu_assign_pointer(chain->rules_gen_1, g0);
+ nf_tables_commit_chain_free_rules_old(g1);
+ } else {
+ rcu_assign_pointer(chain->rules_gen_0, g1);
+ nf_tables_commit_chain_free_rules_old(g0);
+ }
+
+ return;
+ }
+
+ if (next_genbit)
+ rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+ else
+ rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+
+ chain->rules_next = NULL;
+
+ if (g0 == g1)
+ return;
+
+ if (next_genbit)
+ nf_tables_commit_chain_free_rules_old(g1);
+ else
+ nf_tables_commit_chain_free_rules_old(g0);
+}
+
+static void nft_chain_del(struct nft_chain *chain)
+{
+ struct nft_table *table = chain->table;
+
+ WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead,
+ nft_chain_ht_params));
+ list_del_rcu(&chain->list);
+}
+
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
+ struct nft_chain *chain;
+ struct nft_table *table;
- /* Bump generation counter, invalidate any dump in progress */
- while (++net->nft.base_seq == 0);
+ /* 0. Validate ruleset, otherwise roll back for error reporting. */
+ if (nf_tables_validate(net) < 0)
+ return -EAGAIN;
- /* A new generation has just started */
- net->nft.gencursor = nft_gencursor_next(net);
+ /* 1. Allocate space for next generation rules_gen_X[] */
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ int ret;
+
+ if (trans->msg_type == NFT_MSG_NEWRULE ||
+ trans->msg_type == NFT_MSG_DELRULE) {
+ chain = trans->ctx.chain;
+
+ ret = nf_tables_commit_chain_prepare(net, chain);
+ if (ret < 0) {
+ nf_tables_commit_chain_prepare_cancel(net);
+ return ret;
+ }
+ }
+ }
+
+ /* 2. Make rules_gen_X visible to packet path */
+ list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_active_next(net, chain))
+ continue;
+ nf_tables_commit_chain_active(net, chain);
+ }
+ }
- /* Make sure all packets have left the previous generation before
- * purging old rules.
+ /*
+ * Bump generation counter, invalidate any dump in progress.
+ * Cannot fail after this point.
*/
- synchronize_rcu();
+ while (++net->nft.base_seq == 0);
+
+ /* 3. Start new generation, rules_gen_X now in use. */
+ net->nft.gencursor = nft_gencursor_next(net);
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
switch (trans->msg_type) {
@@ -5890,7 +6318,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
case NFT_MSG_DELCHAIN:
- list_del_rcu(&trans->ctx.chain->list);
+ nft_chain_del(trans->ctx.chain);
nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
nf_tables_unregister_hook(trans->ctx.net,
trans->ctx.table,
@@ -6001,7 +6429,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nft_trans_elem(trans).priv, true);
break;
case NFT_MSG_NEWOBJ:
- nft_obj_destroy(nft_trans_obj(trans));
+ nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
break;
case NFT_MSG_NEWFLOWTABLE:
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -6041,7 +6469,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
} else {
trans->ctx.table->use--;
- list_del_rcu(&trans->ctx.chain->list);
+ nft_chain_del(trans->ctx.chain);
nf_tables_unregister_hook(trans->ctx.net,
trans->ctx.table,
trans->ctx.chain);
@@ -6121,6 +6549,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
return 0;
}
+static void nf_tables_cleanup(struct net *net)
+{
+ nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+}
+
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
return net->nft.base_seq == genid;
@@ -6133,6 +6566,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
+ .cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
};
@@ -6216,19 +6650,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
list_for_each_entry(rule, &chain->rules, list) {
nft_rule_for_each_expr(expr, last, rule) {
- const struct nft_data *data = NULL;
+ struct nft_immediate_expr *priv;
+ const struct nft_data *data;
int err;
- if (!expr->ops->validate)
+ if (strcmp(expr->ops->type->name, "immediate"))
continue;
- err = expr->ops->validate(ctx, expr, &data);
- if (err < 0)
- return err;
-
- if (data == NULL)
+ priv = nft_expr_priv(expr);
+ if (priv->dreg != NFT_REG_VERDICT)
continue;
+ data = &priv->data;
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
@@ -6461,8 +6894,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
case NFT_GOTO:
if (!tb[NFTA_VERDICT_CHAIN])
return -EINVAL;
- chain = nf_tables_chain_lookup(ctx->table,
- tb[NFTA_VERDICT_CHAIN], genmask);
+ chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
+ genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
@@ -6638,7 +7071,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
ctx->chain->use--;
nf_tables_rule_release(ctx, rule);
}
- list_del(&ctx->chain->list);
+ nft_chain_del(ctx->chain);
ctx->table->use--;
nf_tables_chain_destroy(ctx);
@@ -6690,11 +7123,11 @@ static void __nft_release_tables(struct net *net)
list_for_each_entry_safe(obj, ne, &table->objects, list) {
list_del(&obj->list);
table->use--;
- nft_obj_destroy(obj);
+ nft_obj_destroy(&ctx, obj);
}
list_for_each_entry_safe(chain, nc, &table->chains, list) {
ctx.chain = chain;
- list_del(&chain->list);
+ nft_chain_del(chain);
table->use--;
nf_tables_chain_destroy(&ctx);
}
@@ -6708,6 +7141,8 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&net->nft.tables);
INIT_LIST_HEAD(&net->nft.commit_list);
net->nft.base_seq = 1;
+ net->nft.validate_state = NFT_VALIDATE_SKIP;
+
return 0;
}
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 40e744572283..deff10adef9c 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -23,25 +23,9 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
-static const char *const comments[__NFT_TRACETYPE_MAX] = {
- [NFT_TRACETYPE_POLICY] = "policy",
- [NFT_TRACETYPE_RETURN] = "return",
- [NFT_TRACETYPE_RULE] = "rule",
-};
-
-static const struct nf_loginfo trace_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = LOGLEVEL_WARNING,
- .logflags = NF_LOG_DEFAULT_MASK,
- },
- },
-};
-
static noinline void __nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
- int rulenum, enum nft_trace_types type)
+ enum nft_trace_types type)
{
const struct nft_pktinfo *pkt = info->pkt;
@@ -52,22 +36,16 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
info->type = type;
nft_trace_notify(info);
-
- nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
- nft_in(pkt), nft_out(pkt), &trace_loginfo,
- "TRACE: %s:%s:%s:%u ",
- chain->table->name, chain->name, comments[type], rulenum);
}
static inline void nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
const struct nft_rule *rule,
- int rulenum,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
info->rule = rule;
- __nft_trace_packet(info, chain, rulenum, type);
+ __nft_trace_packet(info, chain, type);
}
}
@@ -139,8 +117,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
struct nft_jumpstack {
const struct nft_chain *chain;
- const struct nft_rule *rule;
- int rulenum;
+ struct nft_rule *const *rules;
};
unsigned int
@@ -148,31 +125,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
const struct net *net = nft_net(pkt);
+ struct nft_rule *const *rules;
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_regs regs;
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
- int rulenum;
- unsigned int gencursor = nft_genmask_cur(net);
+ bool genbit = READ_ONCE(net->nft.gencursor);
struct nft_traceinfo info;
info.trace = false;
if (static_branch_unlikely(&nft_trace_enabled))
nft_trace_init(&info, pkt, &regs.verdict, basechain);
do_chain:
- rulenum = 0;
- rule = list_entry(&chain->rules, struct nft_rule, list);
+ if (genbit)
+ rules = rcu_dereference(chain->rules_gen_1);
+ else
+ rules = rcu_dereference(chain->rules_gen_0);
+
next_rule:
+ rule = *rules;
regs.verdict.code = NFT_CONTINUE;
- list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
-
- /* This rule is not active, skip. */
- if (unlikely(rule->genmask & gencursor))
- continue;
-
- rulenum++;
-
+ for (; *rules ; rules++) {
+ rule = *rules;
nft_rule_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
@@ -190,7 +165,7 @@ next_rule:
continue;
case NFT_CONTINUE:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RULE);
+ NFT_TRACETYPE_RULE);
continue;
}
break;
@@ -202,7 +177,7 @@ next_rule:
case NF_QUEUE:
case NF_STOLEN:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RULE);
+ NFT_TRACETYPE_RULE);
return regs.verdict.code;
}
@@ -210,22 +185,20 @@ next_rule:
case NFT_JUMP:
BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rule = rule;
- jumpstack[stackptr].rulenum = rulenum;
+ jumpstack[stackptr].rules = rules + 1;
stackptr++;
/* fall through */
case NFT_GOTO:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RULE);
+ NFT_TRACETYPE_RULE);
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE:
- rulenum++;
/* fall through */
case NFT_RETURN:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RETURN);
+ NFT_TRACETYPE_RETURN);
break;
default:
WARN_ON(1);
@@ -234,13 +207,11 @@ next_rule:
if (stackptr > 0) {
stackptr--;
chain = jumpstack[stackptr].chain;
- rule = jumpstack[stackptr].rule;
- rulenum = jumpstack[stackptr].rulenum;
+ rules = jumpstack[stackptr].rules;
goto next_rule;
}
- nft_trace_packet(&info, basechain, NULL, -1,
- NFT_TRACETYPE_POLICY);
+ nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
@@ -258,6 +229,9 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_payload_type,
&nft_dynset_type,
&nft_range_type,
+ &nft_meta_type,
+ &nft_rt_type,
+ &nft_exthdr_type,
};
int __init nf_tables_core_module_init(void)
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..4d0da7042aff 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -25,6 +25,7 @@
#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/init.h>
+#include <linux/sched/signal.h>
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
@@ -37,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
rcu_dereference_protected(table[(id)].subsys, \
lockdep_nfnl_is_held((id)))
+#define NFNL_MAX_ATTR_COUNT 32
+
static struct {
struct mutex mutex;
const struct nfnetlink_subsystem __rcu *subsys;
@@ -76,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
{
+ u8 cb_id;
+
+ /* Sanity-check attr_count size to avoid stack buffer overflow. */
+ for (cb_id = 0; cb_id < n->cb_count; cb_id++)
+ if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
+ return -EINVAL;
+
nfnl_lock(n->subsys_id);
if (table[n->subsys_id].subsys) {
nfnl_unlock(n->subsys_id);
@@ -185,11 +195,17 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
- struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
__u8 subsys_id = NFNL_SUBSYS_ID(type);
+ /* Sanity-check attr_count against NFNL_MAX_ATTR_COUNT (size of cda[]) */
+ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
ss->cb[cb_id].policy, extack);
if (err < 0) {
@@ -330,6 +346,13 @@ replay:
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
+ if (fatal_signal_pending(current)) {
+ nfnl_err_reset(&err_list);
+ err = -EINTR;
+ status = NFNL_BATCH_FAILURE;
+ goto done;
+ }
+
memset(&extack, 0, sizeof(extack));
nlh = nlmsg_hdr(skb);
err = 0;
@@ -379,10 +402,16 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
- struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
+ /* Sanity-check attr_count against NFNL_MAX_ATTR_COUNT (size of cda[]) */
+ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+ err = -ENOMEM;
+ goto ack;
+ }
+
err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
attrlen, ss->cb[cb_id].policy, NULL);
if (err < 0)
@@ -441,10 +470,19 @@ done:
kfree_skb(skb);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
- ss->commit(net, oskb);
+ err = ss->commit(net, oskb);
+ if (err == -EAGAIN) {
+ status |= NFNL_BATCH_REPLAY;
+ goto done;
+ } else if (err) {
+ ss->abort(net, oskb);
+ netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+ }
} else {
ss->abort(net, oskb);
}
+ if (ss->cleanup)
+ ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
nfnl_unlock(subsys_id);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index c14822b9729f..332c69d27b47 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -37,7 +37,6 @@
#include <net/sock.h>
#include <net/netfilter/nf_log.h>
#include <net/netns/generic.h>
-#include <net/netfilter/nfnetlink_log.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
@@ -47,6 +46,7 @@
#include "../bridge/br_private.h"
#endif
+#define NFULNL_COPY_DISABLED 0xff
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
@@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = {
};
/* log handler for internal netfilter logging api */
-void
+static void
nfulnl_log_packet(struct net *net,
u_int8_t pf,
unsigned int hooknum,
@@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net,
struct nfulnl_instance *inst;
const struct nf_loginfo *li;
unsigned int qthreshold;
- unsigned int plen;
+ unsigned int plen = 0;
struct nfnl_log_net *log = nfnl_log_pernet(net);
const struct nfnl_ct_hook *nfnl_ct = NULL;
struct nf_conn *ct = NULL;
@@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net,
if (!inst)
return;
- plen = 0;
if (prefix)
plen = strlen(prefix) + 1;
@@ -760,7 +759,6 @@ alloc_failure:
/* FIXME: statistics */
goto unlock_and_release;
}
-EXPORT_SYMBOL_GPL(nfulnl_log_packet);
static int
nfulnl_rcv_nl_event(struct notifier_block *this,
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 494a9ab35cb6..4ccd2988f9db 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -227,6 +227,25 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
return entry;
}
+static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+ struct nf_ct_hook *ct_hook;
+ int err;
+
+ if (verdict == NF_ACCEPT ||
+ verdict == NF_STOP) {
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook) {
+ err = ct_hook->update(entry->state.net, entry->skb);
+ if (err < 0)
+ verdict = NF_DROP;
+ }
+ rcu_read_unlock();
+ }
+ nf_reinject(entry, verdict);
+}
+
static void
nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
{
@@ -237,7 +256,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
if (!cmpfn || cmpfn(entry, data)) {
list_del(&entry->list);
queue->queue_total--;
- nf_reinject(entry, NF_DROP);
+ nfqnl_reinject(entry, NF_DROP);
}
}
spin_unlock_bh(&queue->lock);
@@ -686,7 +705,7 @@ err_out_free_nskb:
err_out_unlock:
spin_unlock_bh(&queue->lock);
if (failopen)
- nf_reinject(entry, NF_ACCEPT);
+ nfqnl_reinject(entry, NF_ACCEPT);
err_out:
return err;
}
@@ -1085,7 +1104,8 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
list_for_each_entry_safe(entry, tmp, &batch_list, list) {
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
- nf_reinject(entry, verdict);
+
+ nfqnl_reinject(entry, verdict);
}
return 0;
}
@@ -1208,7 +1228,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
- nf_reinject(entry, verdict);
+ nfqnl_reinject(entry, verdict);
return 0;
}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 1d99a1efdafc..8d1ff654e5af 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -611,10 +611,10 @@ nla_put_failure:
return -1;
}
-static int nfnl_compat_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[],
+ struct netlink_ext_ack *extack)
{
int ret = 0, target;
struct nfgenmsg *nfmsg;
@@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
return -EINVAL;
}
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ rcu_read_unlock();
try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
rev, target, &ret),
fmt, name);
-
if (ret < 0)
- return ret;
+ goto out_put;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
- return -ENOMEM;
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
/* include the best revision for this extension in the message */
if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
@@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
nfmsg->nfgen_family,
name, ret, target) <= 0) {
kfree_skb(skb2);
- return -ENOSPC;
+ goto out_put;
}
ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
MSG_DONTWAIT);
if (ret > 0)
ret = 0;
-
+out_put:
+ rcu_read_lock();
+ module_put(THIS_MODULE);
return ret == -EAGAIN ? -ENOBUFS : ret;
}
@@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
};
static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
- [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get,
+ [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu,
.attr_count = NFTA_COMPAT_MAX,
.policy = nfnl_compat_policy_get },
};
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
new file mode 100644
index 000000000000..50c068d660e5
--- /dev/null
+++ b/net/netfilter/nft_connlimit.c
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_count.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/* Private state shared by the connlimit expression and the connlimit object. */
+struct nft_connlimit {
+ /* held (spin_lock_bh) while the eval and gc paths use hhead */
+ spinlock_t lock;
+ /* list head passed to the nf_conncount_* helpers */
+ struct hlist_head hhead;
+ /* threshold parsed from NFTA_CONNLIMIT_COUNT */
+ u32 limit;
+ /* true when NFT_CONNLIMIT_F_INV was set in NFTA_CONNLIMIT_FLAGS */
+ bool invert;
+};
+
+/*
+ * Evaluation path shared by the connlimit expression and object.
+ *
+ * Looks up the packet's tuple on priv->hhead, optionally adds it, and then
+ * compares the resulting count with priv->limit.
+ *
+ * Verdicts written to @regs:
+ *   NF_DROP   - no tuple could be obtained, or nf_conncount_add() failed
+ *   NFT_BREAK - (count > limit) XOR invert is true
+ * In every other case @regs is left untouched. @ext is not used here.
+ */
+static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ const struct nft_set_ext *ext)
+{
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+ const struct nf_conntrack_tuple *tuple_ptr;
+ struct nf_conntrack_tuple tuple;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int count;
+ bool addit;
+
+ tuple_ptr = &tuple;
+
+ /*
+  * Use the conntrack entry's original-direction tuple and zone when the
+  * skb has one; otherwise build a tuple from the packet itself.
+  */
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ if (ct != NULL) {
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ zone = nf_ct_zone(ct);
+ } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
+ nft_pf(pkt), nft_net(pkt), &tuple)) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ /* lookup and add both operate on hhead, so do them under the lock */
+ spin_lock_bh(&priv->lock);
+ count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
+ &addit);
+
+ /* addit is an output of the lookup; when false the add step is skipped */
+ if (!addit)
+ goto out;
+
+ if (!nf_conncount_add(&priv->hhead, tuple_ptr)) {
+ regs->verdict.code = NF_DROP;
+ spin_unlock_bh(&priv->lock);
+ return;
+ }
+ /* account for the entry that was just added */
+ count++;
+out:
+ spin_unlock_bh(&priv->lock);
+
+ if ((count > priv->limit) ^ priv->invert) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+}
+
+/*
+ * Parse the connlimit netlink attributes into @priv and take a conntrack
+ * netns reference for ctx->family.
+ *
+ * Returns -EINVAL when NFTA_CONNLIMIT_COUNT is missing, -EOPNOTSUPP when
+ * NFTA_CONNLIMIT_FLAGS carries any bit other than NFT_CONNLIMIT_F_INV, and
+ * otherwise whatever nf_ct_netns_get() returns.
+ */
+static int nft_connlimit_do_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_connlimit *priv)
+{
+ u32 max_conns, flag_bits = 0;
+
+ if (!tb[NFTA_CONNLIMIT_COUNT])
+ return -EINVAL;
+
+ max_conns = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_COUNT]));
+
+ if (tb[NFTA_CONNLIMIT_FLAGS]) {
+ flag_bits = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_FLAGS]));
+ /* only the invert flag is understood */
+ if (flag_bits & ~NFT_CONNLIMIT_F_INV)
+ return -EOPNOTSUPP;
+ }
+
+ /* nothing is written to priv until all attributes were accepted */
+ spin_lock_init(&priv->lock);
+ INIT_HLIST_HEAD(&priv->hhead);
+ priv->limit = max_conns;
+ priv->invert = (flag_bits & NFT_CONNLIMIT_F_INV) ? true : false;
+
+ return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+/*
+ * Teardown shared by the expression and the object: drop the conntrack netns
+ * reference taken by nf_ct_netns_get() in nft_connlimit_do_init() and hand
+ * priv->hhead to nf_conncount_cache_free().
+ */
+static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
+ struct nft_connlimit *priv)
+{
+ nf_ct_netns_put(ctx->net, ctx->family);
+ nf_conncount_cache_free(&priv->hhead);
+}
+
+/*
+ * Emit the connlimit configuration into @skb: always the count, and the
+ * flags attribute only when inversion is enabled.
+ * Returns 0 on success, -1 if an attribute could not be added.
+ */
+static int nft_connlimit_do_dump(struct sk_buff *skb,
+ struct nft_connlimit *priv)
+{
+ if (nla_put_be32(skb, NFTA_CONNLIMIT_COUNT, htonl(priv->limit)))
+ return -1;
+
+ /* without inversion there is no flags attribute to write */
+ if (!priv->invert)
+ return 0;
+
+ if (nla_put_be32(skb, NFTA_CONNLIMIT_FLAGS, htonl(NFT_CONNLIMIT_F_INV)))
+ return -1;
+
+ return 0;
+}
+
+/* Object ->eval hook: run the shared evaluation on the object's private data. */
+static inline void nft_connlimit_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ nft_connlimit_do_eval(nft_obj_data(obj), regs, pkt, NULL);
+}
+
+/* Object ->init hook: forwards to nft_connlimit_do_init() and returns its result. */
+static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ return nft_connlimit_do_init(ctx, tb, nft_obj_data(obj));
+}
+
+/* Object ->destroy hook: forwards to nft_connlimit_do_destroy(). */
+static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ nft_connlimit_do_destroy(ctx, nft_obj_data(obj));
+}
+
+/* Object ->dump hook; @reset is not used by this object type. */
+static int nft_connlimit_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ return nft_connlimit_do_dump(skb, nft_obj_data(obj));
+}
+
+/* Netlink attribute policy used by both the expression and the object type. */
+static const struct nla_policy nft_connlimit_policy[NFTA_CONNLIMIT_MAX + 1] = {
+ [NFTA_CONNLIMIT_COUNT] = { .type = NLA_U32 },
+ [NFTA_CONNLIMIT_FLAGS] = { .type = NLA_U32 },
+};
+
+/* forward declaration: the ops below point back at the type defined after them */
+static struct nft_object_type nft_connlimit_obj_type;
+/* Operations of the connlimit stateful object. */
+static const struct nft_object_ops nft_connlimit_obj_ops = {
+ .type = &nft_connlimit_obj_type,
+ .size = sizeof(struct nft_connlimit),
+ .eval = nft_connlimit_obj_eval,
+ .init = nft_connlimit_obj_init,
+ .destroy = nft_connlimit_obj_destroy,
+ .dump = nft_connlimit_obj_dump,
+};
+
+/* Object type registered/unregistered by the module init/exit functions. */
+static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
+ .type = NFT_OBJECT_CONNLIMIT,
+ .ops = &nft_connlimit_obj_ops,
+ .maxattr = NFTA_CONNLIMIT_MAX,
+ .policy = nft_connlimit_policy,
+ .owner = THIS_MODULE,
+};
+
+/* Expression ->eval hook: run the shared evaluation on the expression's private data. */
+static void nft_connlimit_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ nft_connlimit_do_eval(nft_expr_priv(expr), regs, pkt, NULL);
+}
+
+/* Expression ->dump hook: forwards to nft_connlimit_do_dump(). */
+static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ return nft_connlimit_do_dump(skb, nft_expr_priv(expr));
+}
+
+/* Expression ->init hook: forwards to nft_connlimit_do_init() and returns its result. */
+static int nft_connlimit_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ return nft_connlimit_do_init(ctx, tb, nft_expr_priv(expr));
+}
+
+/* Expression ->destroy hook: forwards to nft_connlimit_do_destroy(). */
+static void nft_connlimit_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nft_connlimit_do_destroy(ctx, nft_expr_priv(expr));
+}
+
+/*
+ * Expression ->clone hook: the copy inherits limit and invert from @src but
+ * starts with its own freshly initialised lock and empty list. Always
+ * returns 0.
+ */
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_connlimit *from = nft_expr_priv(src);
+ struct nft_connlimit *to = nft_expr_priv(dst);
+
+ /* configuration is copied ... */
+ to->limit = from->limit;
+ to->invert = from->invert;
+
+ /* ... runtime state is not */
+ INIT_HLIST_HEAD(&to->hhead);
+ spin_lock_init(&to->lock);
+
+ return 0;
+}
+
+/*
+ * Expression ->destroy_clone hook: only hands the clone's list to
+ * nf_conncount_cache_free(); unlike nft_connlimit_do_destroy() it does not
+ * call nf_ct_netns_put(). @ctx is unused.
+ */
+static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_connlimit *clone = nft_expr_priv(expr);
+
+ nf_conncount_cache_free(&clone->hhead);
+}
+
+/*
+ * Expression ->gc hook. Runs nf_conncount_lookup() with a NULL tuple under
+ * the lock and then reports whether priv->hhead is empty.
+ *
+ * NOTE(review): the lookup's return value and addit are ignored; presumably
+ * the call is made for its side effect of pruning stale entries from the
+ * list - confirm against nf_conncount_lookup().
+ */
+static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
+{
+ struct nft_connlimit *priv = nft_expr_priv(expr);
+ bool addit, ret;
+
+ spin_lock_bh(&priv->lock);
+ nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
+
+ /* sample emptiness while still holding the lock */
+ ret = hlist_empty(&priv->hhead);
+ spin_unlock_bh(&priv->lock);
+
+ return ret;
+}
+
+/* forward declaration: the ops below point back at the type defined after them */
+static struct nft_expr_type nft_connlimit_type;
+/* Operations of the connlimit expression, including clone and gc hooks. */
+static const struct nft_expr_ops nft_connlimit_ops = {
+ .type = &nft_connlimit_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_connlimit)),
+ .eval = nft_connlimit_eval,
+ .init = nft_connlimit_init,
+ .destroy = nft_connlimit_destroy,
+ .clone = nft_connlimit_clone,
+ .destroy_clone = nft_connlimit_destroy_clone,
+ .dump = nft_connlimit_dump,
+ .gc = nft_connlimit_gc,
+};
+
+/* Expression type "connlimit"; flagged as stateful and as needing gc. */
+static struct nft_expr_type nft_connlimit_type __read_mostly = {
+ .name = "connlimit",
+ .ops = &nft_connlimit_ops,
+ .policy = nft_connlimit_policy,
+ .maxattr = NFTA_CONNLIMIT_MAX,
+ .flags = NFT_EXPR_STATEFUL | NFT_EXPR_GC,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Module init: register the object type first, then the expression type.
+ * If the expression cannot be registered the object type is unregistered
+ * again and the error is returned.
+ */
+static int __init nft_connlimit_module_init(void)
+{
+ int ret = nft_register_obj(&nft_connlimit_obj_type);
+
+ if (ret < 0)
+ return ret;
+
+ ret = nft_register_expr(&nft_connlimit_type);
+ if (ret < 0) {
+ /* roll back the registration done above */
+ nft_unregister_obj(&nft_connlimit_obj_type);
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Module exit: unregister in the reverse order of nft_connlimit_module_init(). */
+static void __exit nft_connlimit_module_exit(void)
+{
+ nft_unregister_expr(&nft_connlimit_type);
+ nft_unregister_obj(&nft_connlimit_obj_type);
+}
+
+/* Module entry points, metadata, and aliases for the expression and object type. */
+module_init(nft_connlimit_module_init);
+module_exit(nft_connlimit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso");
+MODULE_ALIAS_NFT_EXPR("connlimit");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index eefe3b409925..a61d7edfc290 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -96,7 +96,8 @@ static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
free_percpu(priv->counter);
}
-static void nft_counter_obj_destroy(struct nft_object *obj)
+static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
{
struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
@@ -257,6 +258,7 @@ static const struct nft_expr_ops nft_counter_ops = {
.eval = nft_counter_eval,
.init = nft_counter_init,
.destroy = nft_counter_destroy,
+ .destroy_clone = nft_counter_destroy,
.dump = nft_counter_dump,
.clone = nft_counter_clone,
};
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 5c0de704bad5..1435ffc5f57e 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -826,7 +826,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
return 0;
}
-static void nft_ct_helper_obj_destroy(struct nft_object *obj)
+static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
{
struct nft_ct_helper_obj *priv = nft_obj_data(obj);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 04863fad05dd..4d49529cff61 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -36,7 +36,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
u64 timeout;
void *elem;
- if (set->size && !atomic_add_unless(&set->nelems, 1, set->size))
+ if (!atomic_add_unless(&set->nelems, 1, set->size))
return NULL;
timeout = priv->timeout ? : set->timeout;
@@ -81,7 +81,7 @@ static void nft_dynset_eval(const struct nft_expr *expr,
if (priv->op == NFT_DYNSET_OP_UPDATE &&
nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
timeout = priv->timeout ? : set->timeout;
- *nft_set_ext_expiration(ext) = jiffies + timeout;
+ *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
}
if (sexpr != NULL)
@@ -195,6 +195,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
err = -EOPNOTSUPP;
if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
goto err1;
+
+ if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
+ if (set->flags & NFT_SET_TIMEOUT)
+ goto err1;
+ if (!set->ops->gc_init)
+ goto err1;
+ set->ops->gc_init(set);
+ }
+
} else if (set->flags & NFT_SET_EVAL)
return -EINVAL;
@@ -216,6 +225,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (err < 0)
goto err1;
+ if (set->size == 0)
+ set->size = 0xffff;
+
priv->set = set;
return 0;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47ec1046ad11..a940c9fd9045 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -10,11 +10,10 @@
#include <asm/unaligned.h>
#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/tcp.h>
@@ -353,7 +352,6 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static struct nft_expr_type nft_exthdr_type;
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -407,27 +405,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EOPNOTSUPP);
}
-static struct nft_expr_type nft_exthdr_type __read_mostly = {
+struct nft_expr_type nft_exthdr_type __read_mostly = {
.name = "exthdr",
.select_ops = nft_exthdr_select_ops,
.policy = nft_exthdr_policy,
.maxattr = NFTA_EXTHDR_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_exthdr_module_init(void)
-{
- return nft_register_expr(&nft_exthdr_type);
-}
-
-static void __exit nft_exthdr_module_exit(void)
-{
- nft_unregister_expr(&nft_exthdr_type);
-}
-
-module_init(nft_exthdr_module_init);
-module_exit(nft_exthdr_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b65829b2be22..d6bab8c3cbb0 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
if (!tb[NFTA_FLOW_TABLE_NAME])
return -EINVAL;
- flowtable = nf_tables_flowtable_lookup(ctx->table,
- tb[NFTA_FLOW_TABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
+ genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index ce13a50b9189..8abb9891cdf2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,8 +12,12 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_dup_netdev.h>
+#include <net/neighbour.h>
+#include <net/ip.h>
struct nft_fwd_netdev {
enum nft_registers sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
[NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
+ [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_FWD_NFPROTO] = { .type = NLA_U32 },
};
static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ nla_put_failure:
return -1;
}
+/* Private data of the "fwd" variant that takes an explicit neighbour address. */
+struct nft_fwd_neigh {
+ /* register read as the output interface index at eval time */
+ enum nft_registers sreg_dev:8;
+ /* register whose contents are passed to neigh_xmit() as the address */
+ enum nft_registers sreg_addr:8;
+ /* NFPROTO_IPV4 or NFPROTO_IPV6, enforced by nft_fwd_neigh_init() */
+ u8 nfproto;
+};
+
+/*
+ * Forward the packet to the neighbour address held in register sreg_addr via
+ * the device whose ifindex is held in register sreg_dev.
+ *
+ * The TTL (IPv4) or hop limit (IPv6) is decremented first, then the skb is
+ * handed to neigh_xmit().
+ *
+ * Verdicts:
+ *   NF_STOLEN - skb was passed to neigh_xmit()
+ *   NFT_BREAK - skb->protocol does not match priv->nfproto, or nfproto is
+ *               neither IPv4 nor IPv6
+ *   NF_DROP   - the network header could not be made writable
+ *
+ * NOTE(review): when the device lookup fails the function returns without
+ * storing any verdict, although the TTL / hop limit has already been
+ * decremented - confirm this is intended.
+ * NOTE(review): no check of the TTL / hop-limit value before the decrement
+ * is visible in this function - confirm whether one is needed.
+ */
+static void nft_fwd_neigh_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+ void *addr = &regs->data[priv->sreg_addr];
+ int oif = regs->data[priv->sreg_dev];
+ unsigned int verdict = NF_STOLEN;
+ struct sk_buff *skb = pkt->skb;
+ struct net_device *dev;
+ int neigh_table;
+
+ switch (priv->nfproto) {
+ case NFPROTO_IPV4: {
+ struct iphdr *iph;
+
+ if (skb->protocol != htons(ETH_P_IP)) {
+ verdict = NFT_BREAK;
+ goto out;
+ }
+ /* the header is modified below, so it must be writable */
+ if (skb_try_make_writable(skb, sizeof(*iph))) {
+ verdict = NF_DROP;
+ goto out;
+ }
+ iph = ip_hdr(skb);
+ ip_decrease_ttl(iph);
+ neigh_table = NEIGH_ARP_TABLE;
+ break;
+ }
+ case NFPROTO_IPV6: {
+ struct ipv6hdr *ip6h;
+
+ if (skb->protocol != htons(ETH_P_IPV6)) {
+ verdict = NFT_BREAK;
+ goto out;
+ }
+ /* the header is modified below, so it must be writable */
+ if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+ verdict = NF_DROP;
+ goto out;
+ }
+ ip6h = ipv6_hdr(skb);
+ ip6h->hop_limit--;
+ neigh_table = NEIGH_ND_TABLE;
+ break;
+ }
+ default:
+ verdict = NFT_BREAK;
+ goto out;
+ }
+
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+ if (dev == NULL)
+ return;
+
+ skb->dev = dev;
+ neigh_xmit(neigh_table, dev, addr, skb);
+out:
+ regs->verdict.code = verdict;
+}
+
+/*
+ * Initialise the neighbour-forwarding variant of "fwd".
+ *
+ * All three of NFTA_FWD_SREG_DEV, NFTA_FWD_SREG_ADDR and NFTA_FWD_NFPROTO
+ * are required (-EINVAL otherwise). NFTA_FWD_NFPROTO must be NFPROTO_IPV4 or
+ * NFPROTO_IPV6 (-EOPNOTSUPP otherwise). The device register must hold an
+ * int and the address register an in_addr / in6_addr matching the protocol;
+ * errors from nft_validate_register_load() are returned as-is.
+ */
+static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+ unsigned int addr_len;
+ u32 nfproto;
+ int err;
+
+ if (!tb[NFTA_FWD_SREG_DEV] ||
+ !tb[NFTA_FWD_SREG_ADDR] ||
+ !tb[NFTA_FWD_NFPROTO])
+ return -EINVAL;
+
+ /*
+  * Check the full 32-bit attribute value before it is narrowed into the
+  * u8 field, so that e.g. 0x102 is rejected instead of being silently
+  * truncated to 2.
+  */
+ nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
+
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ addr_len = sizeof(struct in_addr);
+ break;
+ case NFPROTO_IPV6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+ priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
+ priv->nfproto = nfproto;
+
+ err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ if (err < 0)
+ return err;
+
+ return nft_validate_register_load(priv->sreg_addr, addr_len);
+}
+
+/* NOTE(review): no initialiser or user of this symbol is visible in this hunk - confirm it is still needed. */
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+/*
+ * Dump the neighbour-forwarding configuration: device register, address
+ * register and protocol, in that order.
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev))
+ return -1;
+
+ if (nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr))
+ return -1;
+
+ if (nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
+ return -1;
+
+ return 0;
+}
+
static struct nft_expr_type nft_fwd_netdev_type;
+/* Ops selected by nft_fwd_select_ops() when NFTA_FWD_SREG_ADDR is present. */
+static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
+ .type = &nft_fwd_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
+ .eval = nft_fwd_neigh_eval,
+ .init = nft_fwd_neigh_init,
+ .dump = nft_fwd_neigh_dump,
+};
+
static const struct nft_expr_ops nft_fwd_netdev_ops = {
.type = &nft_fwd_netdev_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.dump = nft_fwd_netdev_dump,
};
+/*
+ * Choose the "fwd" implementation from the attributes supplied: an address
+ * register selects the neighbour variant, a device register alone selects
+ * the plain netdev variant, and neither yields ERR_PTR(-EOPNOTSUPP).
+ */
+static const struct nft_expr_ops *
+nft_fwd_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ const struct nft_expr_ops *chosen = ERR_PTR(-EOPNOTSUPP);
+
+ /* the address attribute takes precedence over the device attribute */
+ if (tb[NFTA_FWD_SREG_ADDR])
+ chosen = &nft_fwd_neigh_netdev_ops;
+ else if (tb[NFTA_FWD_SREG_DEV])
+ chosen = &nft_fwd_netdev_ops;
+
+ return chosen;
+}
+
static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
.family = NFPROTO_NETDEV,
.name = "fwd",
- .ops = &nft_fwd_netdev_ops,
+ .select_ops = nft_fwd_select_ops,
.policy = nft_fwd_netdev_policy,
.maxattr = NFTA_FWD_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 24f2f7567ddb..c2d237144f74 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -25,6 +25,7 @@ struct nft_jhash {
u32 modulus;
u32 seed;
u32 offset;
+ struct nft_set *map;
};
static void nft_jhash_eval(const struct nft_expr *expr,
@@ -35,14 +36,39 @@ static void nft_jhash_eval(const struct nft_expr *expr,
const void *data = &regs->data[priv->sreg];
u32 h;
- h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+ h = reciprocal_scale(jhash(data, priv->len, priv->seed),
+ priv->modulus);
+
regs->data[priv->dreg] = h + priv->offset;
}
+/*
+ * Jenkins-hash the source register, scale the hash into
+ * [offset, offset + modulus) and use that value as the key for a lookup in
+ * priv->map. On a hit, map->dlen bytes of the element's data are copied to
+ * the destination register.
+ *
+ * NOTE(review): on a miss the function returns without writing the
+ * destination register or a verdict - confirm that is the intended
+ * behaviour.
+ */
+static void nft_jhash_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_jhash *priv = nft_expr_priv(expr);
+ const void *data = &regs->data[priv->sreg];
+ const struct nft_set *map = priv->map;
+ const struct nft_set_ext *ext;
+ u32 result;
+ bool found;
+
+ result = reciprocal_scale(jhash(data, priv->len, priv->seed),
+ priv->modulus) + priv->offset;
+
+ found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+ if (!found)
+ return;
+
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), map->dlen);
+}
+
struct nft_symhash {
enum nft_registers dreg:8;
u32 modulus;
u32 offset;
+ struct nft_set *map;
};
static void nft_symhash_eval(const struct nft_expr *expr,
@@ -58,6 +84,28 @@ static void nft_symhash_eval(const struct nft_expr *expr,
regs->data[priv->dreg] = h + priv->offset;
}
+/*
+ * Take the skb's symmetric flow hash, scale it into
+ * [offset, offset + modulus) and use that value as the key for a lookup in
+ * priv->map. On a hit, map->dlen bytes of the element's data are copied to
+ * the destination register.
+ *
+ * NOTE(review): on a miss the function returns without writing the
+ * destination register or a verdict - confirm that is the intended
+ * behaviour.
+ */
+static void nft_symhash_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_symhash *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ const struct nft_set *map = priv->map;
+ const struct nft_set_ext *ext;
+ u32 result;
+ bool found;
+
+ result = reciprocal_scale(__skb_get_hash_symmetric(skb),
+ priv->modulus) + priv->offset;
+
+ found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+ if (!found)
+ return;
+
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), map->dlen);
+}
+
static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SREG] = { .type = NLA_U32 },
[NFTA_HASH_DREG] = { .type = NLA_U32 },
@@ -66,6 +114,9 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SEED] = { .type = NLA_U32 },
[NFTA_HASH_OFFSET] = { .type = NLA_U32 },
[NFTA_HASH_TYPE] = { .type = NLA_U32 },
+ [NFTA_HASH_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
+ [NFTA_HASH_SET_ID] = { .type = NLA_U32 },
};
static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -97,7 +148,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
priv->len = len;
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
- if (priv->modulus <= 1)
+ if (priv->modulus < 1)
return -ERANGE;
if (priv->offset + priv->modulus - 1 < priv->offset)
@@ -115,6 +166,20 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+/*
+ * Initialise a jhash expression that resolves its result through a map.
+ *
+ * Runs the plain jhash initialisation first and propagates its error, so
+ * that missing or out-of-range attributes are rejected instead of being
+ * silently ignored. Then binds priv->map to the set named by
+ * NFTA_HASH_SET_NAME / NFTA_HASH_SET_ID.
+ *
+ * Returns 0 on success or a negative errno from either step.
+ */
+static int nft_jhash_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_jhash *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ int err;
+
+ err = nft_jhash_init(ctx, expr, tb);
+ if (err < 0)
+ return err;
+
+ priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+ tb[NFTA_HASH_SET_NAME],
+ tb[NFTA_HASH_SET_ID], genmask);
+ return PTR_ERR_OR_ZERO(priv->map);
+}
+
static int nft_symhash_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -141,6 +206,20 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+/*
+ * Initialise a symhash expression that resolves its result through a map.
+ *
+ * The private area of this expression is a struct nft_symhash (that is what
+ * the ops' .size, the eval and the dump functions use), so ->map must be
+ * stored through that type; struct nft_jhash has additional members and its
+ * ->map is not guaranteed to sit at the same offset.
+ *
+ * Runs the plain symhash initialisation first and propagates its error, then
+ * binds priv->map to the set named by NFTA_HASH_SET_NAME / NFTA_HASH_SET_ID.
+ *
+ * Returns 0 on success or a negative errno from either step.
+ */
+static int nft_symhash_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_symhash *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ int err;
+
+ err = nft_symhash_init(ctx, expr, tb);
+ if (err < 0)
+ return err;
+
+ priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+ tb[NFTA_HASH_SET_NAME],
+ tb[NFTA_HASH_SET_ID], genmask);
+ return PTR_ERR_OR_ZERO(priv->map);
+}
+
static int nft_jhash_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -168,6 +247,18 @@ nla_put_failure:
return -1;
}
+/*
+ * Dump a map-backed jhash expression: everything nft_jhash_dump() emits,
+ * followed by the name of the bound map.
+ * Returns 0 on success, -1 on failure.
+ */
+static int nft_jhash_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_jhash *priv = nft_expr_priv(expr);
+
+ if (nft_jhash_dump(skb, expr))
+ return -1;
+
+ if (nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+ return -1;
+
+ return 0;
+}
+
static int nft_symhash_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -188,6 +279,18 @@ nla_put_failure:
return -1;
}
+/*
+ * Dump a map-backed symhash expression: everything nft_symhash_dump() emits,
+ * followed by the name of the bound map.
+ * Returns 0 on success, -1 on failure.
+ */
+static int nft_symhash_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_symhash *priv = nft_expr_priv(expr);
+
+ if (nft_symhash_dump(skb, expr))
+ return -1;
+
+ if (nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+ return -1;
+
+ return 0;
+}
+
static struct nft_expr_type nft_hash_type;
static const struct nft_expr_ops nft_jhash_ops = {
.type = &nft_hash_type,
@@ -197,6 +300,14 @@ static const struct nft_expr_ops nft_jhash_ops = {
.dump = nft_jhash_dump,
};
+/* Ops for the Jenkins hash with map lookup; chosen when NFTA_HASH_SET_NAME is given. */
+static const struct nft_expr_ops nft_jhash_map_ops = {
+ .type = &nft_hash_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
+ .eval = nft_jhash_map_eval,
+ .init = nft_jhash_map_init,
+ .dump = nft_jhash_map_dump,
+};
+
static const struct nft_expr_ops nft_symhash_ops = {
.type = &nft_hash_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
@@ -205,6 +316,14 @@ static const struct nft_expr_ops nft_symhash_ops = {
.dump = nft_symhash_dump,
};
+/* Ops for the symmetric hash with map lookup; chosen when NFTA_HASH_SET_NAME is given. */
+static const struct nft_expr_ops nft_symhash_map_ops = {
+ .type = &nft_hash_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
+ .eval = nft_symhash_map_eval,
+ .init = nft_symhash_map_init,
+ .dump = nft_symhash_map_dump,
+};
+
static const struct nft_expr_ops *
nft_hash_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -217,8 +336,12 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
switch (type) {
case NFT_HASH_SYM:
+ if (tb[NFTA_HASH_SET_NAME])
+ return &nft_symhash_map_ops;
return &nft_symhash_ops;
case NFT_HASH_JENKINS:
+ if (tb[NFTA_HASH_SET_NAME])
+ return &nft_jhash_map_ops;
return &nft_jhash_ops;
default:
break;
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index aa87ff8beae8..15adf8ca82c3 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -17,12 +17,6 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
-struct nft_immediate_expr {
- struct nft_data data;
- enum nft_registers dreg:8;
- u8 dlen;
-};
-
static void nft_immediate_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -101,12 +95,27 @@ nla_put_failure:
static int nft_immediate_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_data **d)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data;
+ int err;
- if (priv->dreg == NFT_REG_VERDICT)
- *data = &priv->data;
+ if (priv->dreg != NFT_REG_VERDICT)
+ return 0;
+
+ data = &priv->data;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ break;
+ default:
+ break;
+ }
return 0;
}
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index a27be36dc0af..7eef1cffbf1b 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -9,12 +9,15 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
+#include <linux/audit.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
#include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
char *prefix;
};
+/*
+ * Append the IPv4 source address, destination address and protocol of @skb
+ * to the audit record @ab.
+ * Returns false (and logs nothing) if the IPv4 header cannot be read.
+ */
+static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+
+ /* _iph is the local buffer handed to skb_header_pointer() */
+ ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph);
+ if (!ih)
+ return false;
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+ &ih->saddr, &ih->daddr, ih->protocol);
+
+ return true;
+}
+
+/*
+ * Append the IPv6 source address, destination address and protocol of @skb
+ * to the audit record @ab.
+ * Returns false (and logs nothing) if the IPv6 header cannot be read.
+ */
+static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ u8 nexthdr;
+ __be16 frag_off;
+
+ /* _ip6h is the local buffer handed to skb_header_pointer() */
+ ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+ if (!ih)
+ return false;
+
+ /*
+  * nexthdr is passed by reference so ipv6_skip_exthdr() can update it;
+  * the function's return value and frag_off are not used afterwards.
+  */
+ nexthdr = ih->nexthdr;
+ ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off);
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+ &ih->saddr, &ih->daddr, nexthdr);
+
+ return true;
+}
+
+/*
+ * Emit an AUDIT_NETFILTER_PKT record for the packet: the skb mark followed
+ * by source address, destination address and protocol.
+ *
+ * Does nothing when auditing is disabled or no audit buffer can be
+ * allocated. For bridge traffic the L3 protocol is taken from the Ethernet
+ * header. When the addresses cannot be determined, placeholder values
+ * ("saddr=? daddr=? proto=-1") are logged instead.
+ */
+static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
+{
+ struct sk_buff *skb = pkt->skb;
+ struct audit_buffer *ab;
+ /* stays -1 unless one of the helpers below logged the addresses */
+ int fam = -1;
+
+ if (!audit_enabled)
+ return;
+
+ /* GFP_ATOMIC is the allocation mode requested for the audit buffer */
+ ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "mark=%#x", skb->mark);
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_BRIDGE:
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+ break;
+ case htons(ETH_P_IPV6):
+ fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+ break;
+ }
+ break;
+ case NFPROTO_IPV4:
+ fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+ break;
+ case NFPROTO_IPV6:
+ fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+ break;
+ }
+
+ if (fam == -1)
+ audit_log_format(ab, " saddr=? daddr=? proto=-1");
+
+ audit_log_end(ab);
+}
+
static void nft_log_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_log *priv = nft_expr_priv(expr);
+ if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
+ priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+ nft_log_eval_audit(pkt);
+ return;
+ }
+
nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
} else {
li->u.log.level = LOGLEVEL_WARNING;
}
- if (li->u.log.level > LOGLEVEL_DEBUG) {
+ if (li->u.log.level > LOGLEVEL_AUDIT) {
err = -EINVAL;
goto err1;
}
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
break;
}
+ if (li->u.log.level == LOGLEVEL_AUDIT)
+ return 0;
+
err = nf_logger_find_get(ctx->family, li->type);
if (err < 0)
goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
if (priv->prefix != nft_log_null_prefix)
kfree(priv->prefix);
+ if (li->u.log.level == LOGLEVEL_AUDIT)
+ return;
+
nf_logger_put(ctx->family, li->type);
}
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f52da5e2199f..42e6fadf1417 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -149,6 +149,52 @@ nla_put_failure:
return -1;
}
+/*
+ * Set-walk callback used by nft_lookup_validate(): if the element's data is
+ * a jump or goto verdict, validate the chain it points to.
+ *
+ * Elements flagged NFT_SET_ELEM_INTERVAL_END are skipped. Returns 0 for
+ * elements that need no validation, otherwise the result of
+ * nft_chain_validate(). @iter is not used here.
+ */
+static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_data *data;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ return nft_chain_validate(ctx, data->verdict.chain);
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Expression ->validate hook for lookup: when the bound set is a verdict
+ * map, walk its elements with nft_lookup_validate_setelem().
+ *
+ * Returns 0 for any other kind of set, iter.err if the walk recorded a
+ * negative error, and 0 otherwise. @d is not used.
+ */
+static int nft_lookup_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **d)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+ struct nft_set_iter iter;
+
+ if (!(priv->set->flags & NFT_SET_MAP) ||
+ priv->set->dtype != NFT_DATA_VERDICT)
+ return 0;
+
+ /* walk from the first element using the next generation's mask */
+ iter.genmask = nft_genmask_next(ctx->net);
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nft_lookup_validate_setelem;
+
+ priv->set->ops->walk(ctx, priv->set, &iter);
+ if (iter.err < 0)
+ return iter.err;
+
+ return 0;
+}
+
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
.init = nft_lookup_init,
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
+ .validate = nft_lookup_validate,
};
struct nft_expr_type nft_lookup_type __read_mostly = {
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 204af9899482..1105a23bda5e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -1,5 +1,7 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2014 Intel Corporation
+ * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,8 +11,6 @@
*/
#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
@@ -24,21 +24,35 @@
#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-#include <net/netfilter/nft_meta.h>
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+/* Private data of the meta expression. */
+struct nft_meta {
+ /* which meta key this expression reads or writes */
+ enum nft_meta_keys key:8;
+ /* dreg and sreg share storage through the anonymous union */
+ union {
+ enum nft_registers dreg:8;
+ enum nft_registers sreg:8;
+ };
+};
+
static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-void nft_meta_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+#ifdef CONFIG_NF_TABLES_BRIDGE
+#include "../bridge/br_private.h"
+#endif
+
+static void nft_meta_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ const struct net_bridge_port *p;
+#endif
switch (priv->key) {
case NFT_META_LEN:
@@ -215,6 +229,18 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store8(dest, !!skb->sp);
break;
#endif
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ case NFT_META_BRI_IIFNAME:
+ if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
+ goto err;
+ strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+ return;
+ case NFT_META_BRI_OIFNAME:
+ if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
+ goto err;
+ strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+ return;
+#endif
default:
WARN_ON(1);
goto err;
@@ -224,11 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr,
err:
regs->verdict.code = NFT_BREAK;
}
-EXPORT_SYMBOL_GPL(nft_meta_get_eval);
-void nft_meta_set_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_meta_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *meta = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
@@ -260,18 +285,16 @@ void nft_meta_set_eval(const struct nft_expr *expr,
WARN_ON(1);
}
}
-EXPORT_SYMBOL_GPL(nft_meta_set_eval);
-const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
[NFTA_META_DREG] = { .type = NLA_U32 },
[NFTA_META_KEY] = { .type = NLA_U32 },
[NFTA_META_SREG] = { .type = NLA_U32 },
};
-EXPORT_SYMBOL_GPL(nft_meta_policy);
-int nft_meta_get_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_meta_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -320,6 +343,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
#endif
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ case NFT_META_BRI_IIFNAME:
+ case NFT_META_BRI_OIFNAME:
+ if (ctx->family != NFPROTO_BRIDGE)
+ return -EOPNOTSUPP;
+ len = IFNAMSIZ;
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -328,7 +359,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
return nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, len);
}
-EXPORT_SYMBOL_GPL(nft_meta_get_init);
static int nft_meta_get_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
@@ -362,9 +392,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
#endif
}
-int nft_meta_set_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_meta_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -390,11 +420,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain, hooks);
}
-EXPORT_SYMBOL_GPL(nft_meta_set_validate);
-int nft_meta_set_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_meta_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -426,10 +455,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
return 0;
}
-EXPORT_SYMBOL_GPL(nft_meta_set_init);
-int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static int nft_meta_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -442,10 +470,8 @@ int nft_meta_get_dump(struct sk_buff *skb,
nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-int nft_meta_set_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -459,19 +485,16 @@ int nft_meta_set_dump(struct sk_buff *skb,
nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nft_meta_set_dump);
-void nft_meta_set_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_meta_set_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
if (priv->key == NFT_META_NFTRACE)
static_branch_dec(&nft_trace_enabled);
}
-EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
-static struct nft_expr_type nft_meta_type;
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -510,27 +533,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
}
-static struct nft_expr_type nft_meta_type __read_mostly = {
+struct nft_expr_type nft_meta_type __read_mostly = {
.name = "meta",
.select_ops = nft_meta_select_ops,
.policy = nft_meta_policy,
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_meta_module_init(void)
-{
- return nft_register_expr(&nft_meta_type);
-}
-
-static void __exit nft_meta_module_exit(void)
-{
- nft_unregister_expr(&nft_meta_type);
-}
-
-module_init(nft_meta_module_init);
-module_exit(nft_meta_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 1f36954c2ba9..c15807d10b91 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr,
const struct nft_nat *priv = nft_expr_priv(expr);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
if (priv->sreg_addr_min) {
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 5a3a52c71545..1f4d0854cf70 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,13 +24,11 @@ struct nft_ng_inc {
u32 modulus;
atomic_t counter;
u32 offset;
+ struct nft_set *map;
};
-static void nft_ng_inc_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
{
- struct nft_ng_inc *priv = nft_expr_priv(expr);
u32 nval, oval;
do {
@@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
- regs->data[priv->dreg] = nval + priv->offset;
+ return nval + priv->offset;
+}
+
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ regs->data[priv->dreg] = nft_ng_inc_gen(priv);
+}
+
+static void nft_ng_inc_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_ng_inc *priv = nft_expr_priv(expr);
+ const struct nft_set *map = priv->map;
+ const struct nft_set_ext *ext;
+ u32 result;
+ bool found;
+
+ result = nft_ng_inc_gen(priv);
+ found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+
+ if (!found)
+ return;
+
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), map->dlen);
}
static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
@@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
[NFTA_NG_MODULUS] = { .type = NLA_U32 },
[NFTA_NG_TYPE] = { .type = NLA_U32 },
[NFTA_NG_OFFSET] = { .type = NLA_U32 },
+ [NFTA_NG_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
+ [NFTA_NG_SET_ID] = { .type = NLA_U32 },
};
static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -71,6 +101,22 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ng_inc *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+
+ nft_ng_inc_init(ctx, expr, tb);
+
+ priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+ tb[NFTA_NG_SET_NAME],
+ tb[NFTA_NG_SET_ID], genmask);
+
+ return PTR_ERR_OR_ZERO(priv->map);
+}
+
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
u32 modulus, enum nft_ng_types type, u32 offset)
{
@@ -97,22 +143,63 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static int nft_ng_inc_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ if (nft_ng_dump(skb, priv->dreg, priv->modulus,
+ NFT_NG_INCREMENTAL, priv->offset) ||
+ nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
struct nft_ng_random {
enum nft_registers dreg:8;
u32 modulus;
u32 offset;
+ struct nft_set *map;
};
+static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+{
+ struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+
+ return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
+ priv->offset;
+}
+
static void nft_ng_random_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
struct nft_ng_random *priv = nft_expr_priv(expr);
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
- u32 val;
- val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
- regs->data[priv->dreg] = val + priv->offset;
+ regs->data[priv->dreg] = nft_ng_random_gen(priv);
+}
+
+static void nft_ng_random_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_ng_random *priv = nft_expr_priv(expr);
+ const struct nft_set *map = priv->map;
+ const struct nft_set_ext *ext;
+ u32 result;
+ bool found;
+
+ result = nft_ng_random_gen(priv);
+ found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+ if (!found)
+ return;
+
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), map->dlen);
}
static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -139,6 +226,23 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+static int nft_ng_random_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ng_random *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+
+ nft_ng_random_init(ctx, expr, tb);
+ priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+ tb[NFTA_NG_SET_NAME],
+ tb[NFTA_NG_SET_ID], genmask);
+ if (IS_ERR(priv->map))
+ return PTR_ERR(priv->map);
+
+ return 0;
+}
+
static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -147,6 +251,22 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static int nft_ng_random_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ if (nft_ng_dump(skb, priv->dreg, priv->modulus,
+ NFT_NG_RANDOM, priv->offset) ||
+ nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
@@ -156,6 +276,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
.dump = nft_ng_inc_dump,
};
+static const struct nft_expr_ops nft_ng_inc_map_ops = {
+ .type = &nft_ng_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
+ .eval = nft_ng_inc_map_eval,
+ .init = nft_ng_inc_map_init,
+ .dump = nft_ng_inc_map_dump,
+};
+
static const struct nft_expr_ops nft_ng_random_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -164,6 +292,14 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.dump = nft_ng_random_dump,
};
+static const struct nft_expr_ops nft_ng_random_map_ops = {
+ .type = &nft_ng_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
+ .eval = nft_ng_random_map_eval,
+ .init = nft_ng_random_map_init,
+ .dump = nft_ng_random_map_dump,
+};
+
static const struct nft_expr_ops *
nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
@@ -178,8 +314,12 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
switch (type) {
case NFT_NG_INCREMENTAL:
+ if (tb[NFTA_NG_SET_NAME])
+ return &nft_ng_inc_map_ops;
return &nft_ng_inc_ops;
case NFT_NG_RANDOM:
+ if (tb[NFTA_NG_SET_NAME])
+ return &nft_ng_random_map_ops;
return &nft_ng_random_ops;
}
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 0b02407773ad..cdf348f751ec 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx,
return -EINVAL;
objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
- obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
- genmask);
+ obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+ genmask);
if (IS_ERR(obj))
return -ENOENT;
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 11a2071b6dd4..76dba9f6b6f6 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -7,8 +7,6 @@
*/
#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
@@ -179,7 +177,6 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
return nft_chain_validate_hooks(ctx->chain, hooks);
}
-static struct nft_expr_type nft_rt_type;
static const struct nft_expr_ops nft_rt_get_ops = {
.type = &nft_rt_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_rt)),
@@ -189,27 +186,10 @@ static const struct nft_expr_ops nft_rt_get_ops = {
.validate = nft_rt_validate,
};
-static struct nft_expr_type nft_rt_type __read_mostly = {
+struct nft_expr_type nft_rt_type __read_mostly = {
.name = "rt",
.ops = &nft_rt_get_ops,
.policy = nft_rt_policy,
.maxattr = NFTA_RT_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_rt_module_init(void)
-{
- return nft_register_expr(&nft_rt_type);
-}
-
-static void __exit nft_rt_module_exit(void)
-{
- nft_unregister_expr(&nft_rt_type);
-}
-
-module_init(nft_rt_module_init);
-module_exit(nft_rt_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
-MODULE_ALIAS_NFT_EXPR("rt");
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 45fb2752fb63..d6626e01c7ee 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_bitmap_type;
-static struct nft_set_ops nft_bitmap_ops __read_mostly = {
- .type = &nft_bitmap_type,
- .privsize = nft_bitmap_privsize,
- .elemsize = offsetof(struct nft_bitmap_elem, ext),
- .estimate = nft_bitmap_estimate,
- .init = nft_bitmap_init,
- .destroy = nft_bitmap_destroy,
- .insert = nft_bitmap_insert,
- .remove = nft_bitmap_remove,
- .deactivate = nft_bitmap_deactivate,
- .flush = nft_bitmap_flush,
- .activate = nft_bitmap_activate,
- .lookup = nft_bitmap_lookup,
- .walk = nft_bitmap_walk,
- .get = nft_bitmap_get,
-};
-
static struct nft_set_type nft_bitmap_type __read_mostly = {
- .ops = &nft_bitmap_ops,
.owner = THIS_MODULE,
+ .ops = {
+ .privsize = nft_bitmap_privsize,
+ .elemsize = offsetof(struct nft_bitmap_elem, ext),
+ .estimate = nft_bitmap_estimate,
+ .init = nft_bitmap_init,
+ .destroy = nft_bitmap_destroy,
+ .insert = nft_bitmap_insert,
+ .remove = nft_bitmap_remove,
+ .deactivate = nft_bitmap_deactivate,
+ .flush = nft_bitmap_flush,
+ .activate = nft_bitmap_activate,
+ .lookup = nft_bitmap_lookup,
+ .walk = nft_bitmap_walk,
+ .get = nft_bitmap_get,
+ },
};
static int __init nft_bitmap_module_init(void)
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index fc9c6d5d64cd..6f9a1365a09f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -311,8 +311,16 @@ static void nft_rhash_gc(struct work_struct *work)
continue;
}
+ if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
+ struct nft_expr *expr = nft_set_ext_expr(&he->ext);
+
+ if (expr->ops->gc &&
+ expr->ops->gc(read_pnet(&set->net), expr))
+ goto gc;
+ }
if (!nft_set_elem_expired(&he->ext))
continue;
+gc:
if (nft_set_elem_mark_busy(&he->ext))
continue;
@@ -339,6 +347,14 @@ static unsigned int nft_rhash_privsize(const struct nlattr * const nla[],
return sizeof(struct nft_rhash);
}
+static void nft_rhash_gc_init(const struct nft_set *set)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+}
+
static int nft_rhash_init(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const tb[])
@@ -356,8 +372,8 @@ static int nft_rhash_init(const struct nft_set *set,
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
if (set->flags & NFT_SET_TIMEOUT)
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
+ nft_rhash_gc_init(set);
+
return 0;
}
@@ -605,6 +621,12 @@ static void nft_hash_destroy(const struct nft_set *set)
static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
+ if (!desc->size)
+ return false;
+
+ if (desc->klen == 4)
+ return false;
+
est->size = sizeof(struct nft_hash) +
nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
desc->size * sizeof(struct nft_hash_elem);
@@ -614,91 +636,101 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_hash_type;
-static struct nft_set_ops nft_rhash_ops __read_mostly = {
- .type = &nft_hash_type,
- .privsize = nft_rhash_privsize,
- .elemsize = offsetof(struct nft_rhash_elem, ext),
- .estimate = nft_rhash_estimate,
- .init = nft_rhash_init,
- .destroy = nft_rhash_destroy,
- .insert = nft_rhash_insert,
- .activate = nft_rhash_activate,
- .deactivate = nft_rhash_deactivate,
- .flush = nft_rhash_flush,
- .remove = nft_rhash_remove,
- .lookup = nft_rhash_lookup,
- .update = nft_rhash_update,
- .walk = nft_rhash_walk,
- .get = nft_rhash_get,
- .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
-};
-
-static struct nft_set_ops nft_hash_ops __read_mostly = {
- .type = &nft_hash_type,
- .privsize = nft_hash_privsize,
- .elemsize = offsetof(struct nft_hash_elem, ext),
- .estimate = nft_hash_estimate,
- .init = nft_hash_init,
- .destroy = nft_hash_destroy,
- .insert = nft_hash_insert,
- .activate = nft_hash_activate,
- .deactivate = nft_hash_deactivate,
- .flush = nft_hash_flush,
- .remove = nft_hash_remove,
- .lookup = nft_hash_lookup,
- .walk = nft_hash_walk,
- .get = nft_hash_get,
- .features = NFT_SET_MAP | NFT_SET_OBJECT,
-};
+static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ if (!desc->size)
+ return false;
-static struct nft_set_ops nft_hash_fast_ops __read_mostly = {
- .type = &nft_hash_type,
- .privsize = nft_hash_privsize,
- .elemsize = offsetof(struct nft_hash_elem, ext),
- .estimate = nft_hash_estimate,
- .init = nft_hash_init,
- .destroy = nft_hash_destroy,
- .insert = nft_hash_insert,
- .activate = nft_hash_activate,
- .deactivate = nft_hash_deactivate,
- .flush = nft_hash_flush,
- .remove = nft_hash_remove,
- .lookup = nft_hash_lookup_fast,
- .walk = nft_hash_walk,
- .get = nft_hash_get,
- .features = NFT_SET_MAP | NFT_SET_OBJECT,
-};
+ if (desc->klen != 4)
+ return false;
-static const struct nft_set_ops *
-nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc,
- u32 flags)
-{
- if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) {
- switch (desc->klen) {
- case 4:
- return &nft_hash_fast_ops;
- default:
- return &nft_hash_ops;
- }
- }
+ est->size = sizeof(struct nft_hash) +
+ nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ desc->size * sizeof(struct nft_hash_elem);
+ est->lookup = NFT_SET_CLASS_O_1;
+ est->space = NFT_SET_CLASS_O_N;
- return &nft_rhash_ops;
+ return true;
}
+static struct nft_set_type nft_rhash_type __read_mostly = {
+ .owner = THIS_MODULE,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT | NFT_SET_EVAL,
+ .ops = {
+ .privsize = nft_rhash_privsize,
+ .elemsize = offsetof(struct nft_rhash_elem, ext),
+ .estimate = nft_rhash_estimate,
+ .init = nft_rhash_init,
+ .gc_init = nft_rhash_gc_init,
+ .destroy = nft_rhash_destroy,
+ .insert = nft_rhash_insert,
+ .activate = nft_rhash_activate,
+ .deactivate = nft_rhash_deactivate,
+ .flush = nft_rhash_flush,
+ .remove = nft_rhash_remove,
+ .lookup = nft_rhash_lookup,
+ .update = nft_rhash_update,
+ .walk = nft_rhash_walk,
+ .get = nft_rhash_get,
+ },
+};
+
static struct nft_set_type nft_hash_type __read_mostly = {
- .select_ops = nft_hash_select_ops,
.owner = THIS_MODULE,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT,
+ .ops = {
+ .privsize = nft_hash_privsize,
+ .elemsize = offsetof(struct nft_hash_elem, ext),
+ .estimate = nft_hash_estimate,
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .insert = nft_hash_insert,
+ .activate = nft_hash_activate,
+ .deactivate = nft_hash_deactivate,
+ .flush = nft_hash_flush,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup,
+ .walk = nft_hash_walk,
+ .get = nft_hash_get,
+ },
+};
+
+static struct nft_set_type nft_hash_fast_type __read_mostly = {
+ .owner = THIS_MODULE,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT,
+ .ops = {
+ .privsize = nft_hash_privsize,
+ .elemsize = offsetof(struct nft_hash_elem, ext),
+ .estimate = nft_hash_fast_estimate,
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .insert = nft_hash_insert,
+ .activate = nft_hash_activate,
+ .deactivate = nft_hash_deactivate,
+ .flush = nft_hash_flush,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup_fast,
+ .walk = nft_hash_walk,
+ .get = nft_hash_get,
+ },
};
static int __init nft_hash_module_init(void)
{
- return nft_register_set(&nft_hash_type);
+ if (nft_register_set(&nft_hash_fast_type) ||
+ nft_register_set(&nft_hash_type) ||
+ nft_register_set(&nft_rhash_type))
+ return 1;
+ return 0;
}
static void __exit nft_hash_module_exit(void)
{
+ nft_unregister_set(&nft_rhash_type);
nft_unregister_set(&nft_hash_type);
+ nft_unregister_set(&nft_hash_fast_type);
}
module_init(nft_hash_module_init);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e6f08bc5f359..d260ce2d6671 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -22,6 +22,7 @@ struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
seqcount_t count;
+ struct delayed_work gc_work;
};
struct nft_rbtree_elem {
@@ -265,6 +266,7 @@ static void nft_rbtree_activate(const struct net *net,
struct nft_rbtree_elem *rbe = elem->priv;
nft_set_elem_change_active(net, set, &rbe->ext);
+ nft_set_elem_clear_busy(&rbe->ext);
}
static bool nft_rbtree_flush(const struct net *net,
@@ -272,8 +274,12 @@ static bool nft_rbtree_flush(const struct net *net,
{
struct nft_rbtree_elem *rbe = priv;
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
+ if (!nft_set_elem_mark_busy(&rbe->ext) ||
+ !nft_is_active(net, &rbe->ext)) {
+ nft_set_elem_change_active(net, set, &rbe->ext);
+ return true;
+ }
+ return false;
}
static void *nft_rbtree_deactivate(const struct net *net,
@@ -347,6 +353,62 @@ cont:
read_unlock_bh(&priv->lock);
}
+static void nft_rbtree_gc(struct work_struct *work)
+{
+ struct nft_set_gc_batch *gcb = NULL;
+ struct rb_node *node, *prev = NULL;
+ struct nft_rbtree_elem *rbe;
+ struct nft_rbtree *priv;
+ struct nft_set *set;
+ int i;
+
+ priv = container_of(work, struct nft_rbtree, gc_work.work);
+ set = nft_set_container_of(priv);
+
+ write_lock_bh(&priv->lock);
+ write_seqcount_begin(&priv->count);
+ for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+ if (nft_rbtree_interval_end(rbe)) {
+ prev = node;
+ continue;
+ }
+ if (!nft_set_elem_expired(&rbe->ext))
+ continue;
+ if (nft_set_elem_mark_busy(&rbe->ext))
+ continue;
+
+ gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+ if (!gcb)
+ goto out;
+
+ atomic_dec(&set->nelems);
+ nft_set_gc_batch_add(gcb, rbe);
+
+ if (prev) {
+ rbe = rb_entry(prev, struct nft_rbtree_elem, node);
+ atomic_dec(&set->nelems);
+ nft_set_gc_batch_add(gcb, rbe);
+ }
+ node = rb_next(node);
+ }
+out:
+ if (gcb) {
+ for (i = 0; i < gcb->head.cnt; i++) {
+ rbe = gcb->elems[i];
+ rb_erase(&rbe->node, &priv->root);
+ }
+ }
+ write_seqcount_end(&priv->count);
+ write_unlock_bh(&priv->lock);
+
+ nft_set_gc_batch_complete(gcb);
+
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+}
+
static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
@@ -362,6 +424,12 @@ static int nft_rbtree_init(const struct nft_set *set,
rwlock_init(&priv->lock);
seqcount_init(&priv->count);
priv->root = RB_ROOT;
+
+ INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
+ if (set->flags & NFT_SET_TIMEOUT)
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+
return 0;
}
@@ -371,6 +439,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
struct nft_rbtree_elem *rbe;
struct rb_node *node;
+ cancel_delayed_work_sync(&priv->gc_work);
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -393,28 +462,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_rbtree_type;
-static struct nft_set_ops nft_rbtree_ops __read_mostly = {
- .type = &nft_rbtree_type,
- .privsize = nft_rbtree_privsize,
- .elemsize = offsetof(struct nft_rbtree_elem, ext),
- .estimate = nft_rbtree_estimate,
- .init = nft_rbtree_init,
- .destroy = nft_rbtree_destroy,
- .insert = nft_rbtree_insert,
- .remove = nft_rbtree_remove,
- .deactivate = nft_rbtree_deactivate,
- .flush = nft_rbtree_flush,
- .activate = nft_rbtree_activate,
- .lookup = nft_rbtree_lookup,
- .walk = nft_rbtree_walk,
- .get = nft_rbtree_get,
- .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
-};
-
static struct nft_set_type nft_rbtree_type __read_mostly = {
- .ops = &nft_rbtree_ops,
.owner = THIS_MODULE,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
+ .ops = {
+ .privsize = nft_rbtree_privsize,
+ .elemsize = offsetof(struct nft_rbtree_elem, ext),
+ .estimate = nft_rbtree_estimate,
+ .init = nft_rbtree_init,
+ .destroy = nft_rbtree_destroy,
+ .insert = nft_rbtree_insert,
+ .remove = nft_rbtree_remove,
+ .deactivate = nft_rbtree_deactivate,
+ .flush = nft_rbtree_flush,
+ .activate = nft_rbtree_activate,
+ .lookup = nft_rbtree_lookup,
+ .walk = nft_rbtree_walk,
+ .get = nft_rbtree_get,
+ },
};
static int __init nft_rbtree_module_init(void)
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
new file mode 100644
index 000000000000..f28a0b944087
--- /dev/null
+++ b/net/netfilter/nft_socket.c
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_socket.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+
+struct nft_socket {
+ enum nft_socket_keys key:8;
+ union {
+ enum nft_registers dreg:8;
+ };
+};
+
+static void nft_socket_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_socket *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ struct sock *sk = skb->sk;
+ u32 *dest = &regs->data[priv->dreg];
+
+ if (!sk)
+ switch(nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
+ break;
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+ case NFPROTO_IPV6:
+ sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ if(!sk) {
+ nft_reg_store8(dest, 0);
+ return;
+ }
+
+ /* So that subsequent socket matching not to require other lookups. */
+ skb->sk = sk;
+
+ switch(priv->key) {
+ case NFT_SOCKET_TRANSPARENT:
+ nft_reg_store8(dest, inet_sk_transparent(sk));
+ break;
+ default:
+ WARN_ON(1);
+ regs->verdict.code = NFT_BREAK;
+ }
+}
+
+static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
+ [NFTA_SOCKET_KEY] = { .type = NLA_U32 },
+ [NFTA_SOCKET_DREG] = { .type = NLA_U32 },
+};
+
+static int nft_socket_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_socket *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
+ return -EINVAL;
+
+ switch(ctx->family) {
+ case NFPROTO_IPV4:
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+ case NFPROTO_IPV6:
+#endif
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+ switch(priv->key) {
+ case NFT_SOCKET_TRANSPARENT:
+ len = sizeof(u8);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+static int nft_socket_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_socket *priv = nft_expr_priv(expr);
+
+ if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+ return -1;
+ if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
+ return -1;
+ return 0;
+}
+
+static struct nft_expr_type nft_socket_type;
+static const struct nft_expr_ops nft_socket_ops = {
+ .type = &nft_socket_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_socket)),
+ .eval = nft_socket_eval,
+ .init = nft_socket_init,
+ .dump = nft_socket_dump,
+};
+
+static struct nft_expr_type nft_socket_type __read_mostly = {
+ .name = "socket",
+ .ops = &nft_socket_ops,
+ .policy = nft_socket_policy,
+ .maxattr = NFTA_SOCKET_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_socket_module_init(void)
+{
+ return nft_register_expr(&nft_socket_type);
+}
+
+static void __exit nft_socket_module_exit(void)
+{
+ nft_unregister_expr(&nft_socket_type);
+}
+
+module_init(nft_socket_module_init);
+module_exit(nft_socket_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables socket match module");
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 58aa9dd3c5b7..1d437875e15a 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -21,8 +21,8 @@
static unsigned int
netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
- struct nf_nat_range newrange;
+ const struct nf_nat_range2 *range = par->targinfo;
+ struct nf_nat_range2 newrange;
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
union nf_inet_addr new_addr, netmask;
@@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range2 *range = par->targinfo;
if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
return -EINVAL;
@@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
enum ip_conntrack_info ctinfo;
__be32 new_ip, netmask;
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING &&
xt_hooknum(par) != NF_INET_POST_ROUTING &&
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index c7f8958cea4a..1ed0cac585c4 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -13,7 +13,6 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_NFLOG.h>
#include <net/netfilter/nf_log.h>
-#include <net/netfilter/nfnetlink_log.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
@@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (info->flags & XT_NFLOG_F_COPY_LEN)
li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
- nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
- xt_in(par), xt_out(par), &li, info->prefix);
+ nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", info->prefix);
+
return XT_CONTINUE;
}
@@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->prefix[sizeof(info->prefix) - 1] != '\0')
return -EINVAL;
- return 0;
+
+ return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+}
+
+static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
}
static struct xt_target nflog_tg_reg __read_mostly = {
@@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = {
.revision = 0,
.family = NFPROTO_UNSPEC,
.checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
.target = nflog_tg,
.targetsize = sizeof(struct xt_nflog_info),
.me = THIS_MODULE,
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 98a4c6d4f1cb..5ce9461e979c 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range2 *range = par->targinfo;
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 8c89323c06af..58fce4e749a9 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,264 +33,9 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
+#include <net/netfilter/nf_tproxy.h>
#include <linux/netfilter/xt_TPROXY.h>
-enum nf_tproxy_lookup_t {
- NFT_LOOKUP_LISTENER,
- NFT_LOOKUP_ESTABLISHED,
-};
-
-static bool tproxy_sk_is_transparent(struct sock *sk)
-{
- switch (sk->sk_state) {
- case TCP_TIME_WAIT:
- if (inet_twsk(sk)->tw_transparent)
- return true;
- break;
- case TCP_NEW_SYN_RECV:
- if (inet_rsk(inet_reqsk(sk))->no_srccheck)
- return true;
- break;
- default:
- if (inet_sk(sk)->transparent)
- return true;
- }
-
- sock_gen_put(sk);
- return false;
-}
-
-static inline __be32
-tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
-{
- struct in_device *indev;
- __be32 laddr;
-
- if (user_laddr)
- return user_laddr;
-
- laddr = 0;
- indev = __in_dev_get_rcu(skb->dev);
- for_primary_ifa(indev) {
- laddr = ifa->ifa_local;
- break;
- } endfor_ifa(indev);
-
- return laddr ? laddr : daddr;
-}
-
-/*
- * This is used when the user wants to intercept a connection matching
- * an explicit iptables rule. In this case the sockets are assumed
- * matching in preference order:
- *
- * - match: if there's a fully established connection matching the
- * _packet_ tuple, it is returned, assuming the redirection
- * already took place and we process a packet belonging to an
- * established connection
- *
- * - match: if there's a listening socket matching the redirection
- * (e.g. on-port & on-ip of the connection), it is returned,
- * regardless if it was bound to 0.0.0.0 or an explicit
- * address. The reasoning is that if there's an explicit rule, it
- * does not really matter if the listener is bound to an interface
- * or to 0. The user already stated that he wants redirection
- * (since he added the rule).
- *
- * Please note that there's an overlap between what a TPROXY target
- * and a socket match will match. Normally if you have both rules the
- * "socket" match will be the first one, effectively all packets
- * belonging to established connections going through that one.
- */
-static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
- const u8 protocol,
- const __be32 saddr, const __be32 daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in,
- const enum nf_tproxy_lookup_t lookup_type)
-{
- struct sock *sk;
- struct tcphdr *tcph;
-
- switch (protocol) {
- case IPPROTO_TCP:
- switch (lookup_type) {
- case NFT_LOOKUP_LISTENER:
- tcph = hp;
- sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
- ip_hdrlen(skb) +
- __tcp_hdrlen(tcph),
- saddr, sport,
- daddr, dport,
- in->ifindex, 0);
-
- if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
- sk = NULL;
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- break;
- case NFT_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, dport,
- in->ifindex);
- break;
- default:
- BUG();
- }
- break;
- case IPPROTO_UDP:
- sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- if (sk) {
- int connected = (sk->sk_state == TCP_ESTABLISHED);
- int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
-
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
- (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
- sock_put(sk);
- sk = NULL;
- }
- }
- break;
- default:
- WARN_ON(1);
- sk = NULL;
- }
-
- pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
- protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
-
- return sk;
-}
-
-#ifdef XT_TPROXY_HAVE_IPV6
-static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
- const u8 protocol,
- const struct in6_addr *saddr, const struct in6_addr *daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in,
- const enum nf_tproxy_lookup_t lookup_type)
-{
- struct sock *sk;
- struct tcphdr *tcph;
-
- switch (protocol) {
- case IPPROTO_TCP:
- switch (lookup_type) {
- case NFT_LOOKUP_LISTENER:
- tcph = hp;
- sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
- thoff + __tcp_hdrlen(tcph),
- saddr, sport,
- daddr, ntohs(dport),
- in->ifindex, 0);
-
- if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
- sk = NULL;
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- break;
- case NFT_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, ntohs(dport),
- in->ifindex, 0);
- break;
- default:
- BUG();
- }
- break;
- case IPPROTO_UDP:
- sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- if (sk) {
- int connected = (sk->sk_state == TCP_ESTABLISHED);
- int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
-
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
- (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
- sock_put(sk);
- sk = NULL;
- }
- }
- break;
- default:
- WARN_ON(1);
- sk = NULL;
- }
-
- pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
- protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
-
- return sk;
-}
-#endif
-
-/**
- * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
- * @skb: The skb being processed.
- * @laddr: IPv4 address to redirect to or zero.
- * @lport: TCP port to redirect to or zero.
- * @sk: The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait4() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
- __be32 laddr, __be16 lport, struct sock *sk)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct tcphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- inet_twsk_put(inet_twsk(sk));
- return NULL;
- }
-
- if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
- /* SYN to a TIME_WAIT socket, we'd rather redirect it
- * to a listener socket if there's one */
- struct sock *sk2;
-
- sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
- iph->saddr, laddr ? laddr : iph->daddr,
- hp->source, lport ? lport : hp->dest,
- skb->dev, NFT_LOOKUP_LISTENER);
- if (sk2) {
- inet_twsk_deschedule_put(inet_twsk(sk));
- sk = sk2;
- }
- }
-
- return sk;
-}
-
/* assign a socket to the skb -- consumes sk */
static void
nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
@@ -319,26 +64,26 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
- skb->dev, NFT_LOOKUP_ESTABLISHED);
+ skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
- laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+ laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
if (!lport)
lport = hp->dest;
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
- sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
+ sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
- skb->dev, NFT_LOOKUP_LISTENER);
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && tproxy_sk_is_transparent(sk)) {
+ if (sk && nf_tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -377,87 +122,6 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef XT_TPROXY_HAVE_IPV6
-static inline const struct in6_addr *
-tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
- const struct in6_addr *daddr)
-{
- struct inet6_dev *indev;
- struct inet6_ifaddr *ifa;
- struct in6_addr *laddr;
-
- if (!ipv6_addr_any(user_laddr))
- return user_laddr;
- laddr = NULL;
-
- indev = __in6_dev_get(skb->dev);
- if (indev) {
- read_lock_bh(&indev->lock);
- list_for_each_entry(ifa, &indev->addr_list, if_list) {
- if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
- continue;
-
- laddr = &ifa->addr;
- break;
- }
- read_unlock_bh(&indev->lock);
- }
-
- return laddr ? laddr : daddr;
-}
-
-/**
- * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
- * @skb: The skb being processed.
- * @tproto: Transport protocol.
- * @thoff: Transport protocol header offset.
- * @par: Iptables target parameters.
- * @sk: The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait6() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
- const struct xt_action_param *par,
- struct sock *sk)
-{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct tcphdr _hdr, *hp;
- const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
-
- hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- inet_twsk_put(inet_twsk(sk));
- return NULL;
- }
-
- if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
- /* SYN to a TIME_WAIT socket, we'd rather redirect it
- * to a listener socket if there's one */
- struct sock *sk2;
-
- sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
- &iph->saddr,
- tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
- hp->source,
- tgi->lport ? tgi->lport : hp->dest,
- skb->dev, NFT_LOOKUP_LISTENER);
- if (sk2) {
- inet_twsk_deschedule_put(inet_twsk(sk));
- sk = sk2;
- }
- }
-
- return sk;
-}
-
static unsigned int
tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -489,25 +153,29 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
- xt_in(par), NFT_LOOKUP_ESTABLISHED);
+ xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
- laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+ laddr = nf_tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
lport = tgi->lport ? tgi->lport : hp->dest;
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
- sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+ sk = nf_tproxy_handle_time_wait6(skb, tproto, thoff,
+ xt_net(par),
+ &tgi->laddr.in6,
+ tgi->lport,
+ sk);
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
tproto, &iph->saddr, laddr,
hp->source, lport,
- xt_in(par), NFT_LOOKUP_LISTENER);
+ xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && tproxy_sk_is_transparent(sk)) {
+ if (sk && nf_tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bdb689cdc829..8af9707f8789 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static void xt_nat_convert_range(struct nf_nat_range *dst,
+static void xt_nat_convert_range(struct nf_nat_range2 *dst,
const struct nf_nat_ipv4_range *src)
{
memset(&dst->min_addr, 0, sizeof(dst->min_addr));
memset(&dst->max_addr, 0, sizeof(dst->max_addr));
+ memset(&dst->base_proto, 0, sizeof(dst->base_proto));
dst->flags = src->flags;
dst->min_addr.ip = src->min_ip;
@@ -54,7 +55,7 @@ static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -71,7 +72,7 @@ static unsigned int
xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
static unsigned int
xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range *range_v1 = par->targinfo;
+ struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
(ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
- return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+ memcpy(&range, range_v1, sizeof(*range_v1));
+ memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
static unsigned int
xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range *range_v1 = par->targinfo;
+ struct nf_nat_range2 range;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ WARN_ON(!(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
+
+ memcpy(&range, range_v1, sizeof(*range_v1));
+ memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+static unsigned int
+xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ WARN_ON(!(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY)));
+
+ return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -163,6 +201,34 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT),
.me = THIS_MODULE,
},
+ {
+ .name = "SNAT",
+ .revision = 2,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
+ .target = xt_snat_target_v2,
+ .targetsize = sizeof(struct nf_nat_range2),
+ .table = "nat",
+ .hooks = (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ /* Same checkentry/destroy pair as SNAT revision 2 above.
+ * xt_nat_destroy() drops a conntrack netns reference
+ * (nf_ct_netns_put()); xt_nat_checkentry() presumably takes it.
+ */
+ {
+ .name = "DNAT",
+ .revision = 2,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
+ .target = xt_dnat_target_v2,
+ .targetsize = sizeof(struct nf_nat_range2),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
};
static int __init xt_nat_init(void)
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a34f314a8c23..9cfef73b4107 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,21 +37,6 @@
#include <net/netfilter/nf_log.h>
#include <linux/netfilter/xt_osf.h>
-struct xt_osf_finger {
- struct rcu_head rcu_head;
- struct list_head finger_entry;
- struct xt_osf_user_finger finger;
-};
-
-enum osf_fmatch_states {
- /* Packet does not match the fingerprint */
- FMATCH_WRONG = 0,
- /* Packet matches the fingerprint */
- FMATCH_OK,
- /* Options do not match the fingerprint, but header does */
- FMATCH_OPT_WRONG,
-};
-
/*
* Indexed by dont-fragment bit.
* It is the only constant value in the fingerprint.
@@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
.cb = xt_osf_nfnetlink_callbacks,
};
-static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
- unsigned char f_ttl)
-{
- const struct iphdr *ip = ip_hdr(skb);
-
- if (info->flags & XT_OSF_TTL) {
- if (info->ttl == XT_OSF_TTL_TRUE)
- return ip->ttl == f_ttl;
- if (info->ttl == XT_OSF_TTL_NOCHECK)
- return 1;
- else if (ip->ttl <= f_ttl)
- return 1;
- else {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
- int ret = 0;
-
- for_ifa(in_dev) {
- if (inet_ifa_match(ip->saddr, ifa)) {
- ret = (ip->ttl == f_ttl);
- break;
- }
- }
- endfor_ifa(in_dev);
-
- return ret;
- }
- }
-
- return ip->ttl == f_ttl;
-}
-
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
const struct xt_osf_info *info = p->matchinfo;
- const struct iphdr *ip = ip_hdr(skb);
- const struct tcphdr *tcp;
- struct tcphdr _tcph;
- int fmatch = FMATCH_WRONG, fcount = 0;
- unsigned int optsize = 0, check_WSS = 0;
- u16 window, totlen, mss = 0;
- bool df;
- const unsigned char *optp = NULL, *_optp = NULL;
- unsigned char opts[MAX_IPOPTLEN];
- const struct xt_osf_finger *kf;
- const struct xt_osf_user_finger *f;
struct net *net = xt_net(p);
if (!info)
return false;
- tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
- if (!tcp)
- return false;
-
- if (!tcp->syn)
- return false;
-
- totlen = ntohs(ip->tot_len);
- df = ntohs(ip->frag_off) & IP_DF;
- window = ntohs(tcp->window);
-
- if (tcp->doff * 4 > sizeof(struct tcphdr)) {
- optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
- _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
- sizeof(struct tcphdr), optsize, opts);
- }
-
- list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
- int foptsize, optnum;
-
- f = &kf->finger;
-
- if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
- continue;
-
- optp = _optp;
- fmatch = FMATCH_WRONG;
-
- if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl))
- continue;
-
- /*
- * Should not happen if userspace parser was written correctly.
- */
- if (f->wss.wc >= OSF_WSS_MAX)
- continue;
-
- /* Check options */
-
- foptsize = 0;
- for (optnum = 0; optnum < f->opt_num; ++optnum)
- foptsize += f->opt[optnum].length;
-
- if (foptsize > MAX_IPOPTLEN ||
- optsize > MAX_IPOPTLEN ||
- optsize != foptsize)
- continue;
-
- check_WSS = f->wss.wc;
-
- for (optnum = 0; optnum < f->opt_num; ++optnum) {
- if (f->opt[optnum].kind == (*optp)) {
- __u32 len = f->opt[optnum].length;
- const __u8 *optend = optp + len;
-
- fmatch = FMATCH_OK;
-
- switch (*optp) {
- case OSFOPT_MSS:
- mss = optp[3];
- mss <<= 8;
- mss |= optp[2];
-
- mss = ntohs((__force __be16)mss);
- break;
- case OSFOPT_TS:
- break;
- }
-
- optp = optend;
- } else
- fmatch = FMATCH_OPT_WRONG;
-
- if (fmatch != FMATCH_OK)
- break;
- }
-
- if (fmatch != FMATCH_OPT_WRONG) {
- fmatch = FMATCH_WRONG;
-
- switch (check_WSS) {
- case OSF_WSS_PLAIN:
- if (f->wss.val == 0 || window == f->wss.val)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MSS:
- /*
- * Some smart modems decrease mangle MSS to
- * SMART_MSS_2, so we check standard, decreased
- * and the one provided in the fingerprint MSS
- * values.
- */
-#define SMART_MSS_1 1460
-#define SMART_MSS_2 1448
- if (window == f->wss.val * mss ||
- window == f->wss.val * SMART_MSS_1 ||
- window == f->wss.val * SMART_MSS_2)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MTU:
- if (window == f->wss.val * (mss + 40) ||
- window == f->wss.val * (SMART_MSS_1 + 40) ||
- window == f->wss.val * (SMART_MSS_2 + 40))
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MODULO:
- if ((window % f->wss.val) == 0)
- fmatch = FMATCH_OK;
- break;
- }
- }
-
- if (fmatch != FMATCH_OK)
- continue;
-
- fcount++;
-
- if (info->flags & XT_OSF_LOG)
- nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
- xt_in(p), xt_out(p), NULL,
- "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
- f->genre, f->version, f->subtype,
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest),
- f->ttl - ip->ttl);
-
- if ((info->flags & XT_OSF_LOG) &&
- info->loglevel == XT_OSF_LOGLEVEL_FIRST)
- break;
- }
-
- if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
- xt_out(p), NULL,
- "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest));
-
- if (fcount)
- fmatch = FMATCH_OK;
-
- return fmatch == FMATCH_OK;
+ return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
+ xt_out(p), info, net, xt_osf_fingers);
}
static struct xt_match xt_osf_match = {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ac7f674d19b..5c0779c4fa3c 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -73,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = nf_sk_is_transparent(sk);
+ transparent = inet_sk_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
@@ -130,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = nf_sk_is_transparent(sk);
+ transparent = inet_sk_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))