summary refs log tree commit diff
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r-- net/netfilter/Kconfig 53
-rw-r--r-- net/netfilter/Makefile 17
-rw-r--r-- net/netfilter/core.c 69
-rw-r--r-- net/netfilter/ipset/Kconfig 2
-rw-r--r-- net/netfilter/ipset/ip_set_bitmap_gen.h 18
-rw-r--r-- net/netfilter/ipset/ip_set_bitmap_ip.c 7
-rw-r--r-- net/netfilter/ipset/ip_set_core.c 152
-rw-r--r-- net/netfilter/ipset/ip_set_hash_gen.h 30
-rw-r--r-- net/netfilter/ipset/ip_set_hash_netiface.c 18
-rw-r--r-- net/netfilter/ipset/ip_set_hash_netportnet.c 1
-rw-r--r-- net/netfilter/ipset/ip_set_list_set.c 51
-rw-r--r-- net/netfilter/ipvs/Kconfig 29
-rw-r--r-- net/netfilter/ipvs/ip_vs_app.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_conn.c 108
-rw-r--r-- net/netfilter/ipvs/ip_vs_core.c 33
-rw-r--r-- net/netfilter/ipvs/ip_vs_ctl.c 124
-rw-r--r-- net/netfilter/ipvs/ip_vs_dh.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_est.c 25
-rw-r--r-- net/netfilter/ipvs/ip_vs_fo.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_ftp.c 10
-rw-r--r-- net/netfilter/ipvs/ip_vs_lblc.c 16
-rw-r--r-- net/netfilter/ipvs/ip_vs_lblcr.c 16
-rw-r--r-- net/netfilter/ipvs/ip_vs_lc.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_mh.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_nfct.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_nq.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_ovf.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_pe.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_pe_sip.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto.c 7
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_ah_esp.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_sctp.c 6
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_tcp.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_udp.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_rr.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_sched.c 3
-rw-r--r-- net/netfilter/ipvs/ip_vs_sed.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_sh.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_sync.c 42
-rw-r--r-- net/netfilter/ipvs/ip_vs_twos.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_wlc.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_wrr.c 4
-rw-r--r-- net/netfilter/ipvs/ip_vs_xmit.c 144
-rw-r--r-- net/netfilter/nf_bpf_link.c 332
-rw-r--r-- net/netfilter/nf_conncount.c 242
-rw-r--r-- net/netfilter/nf_conntrack_amanda.c 2
-rw-r--r-- net/netfilter/nf_conntrack_bpf.c 106
-rw-r--r-- net/netfilter/nf_conntrack_broadcast.c 3
-rw-r--r-- net/netfilter/nf_conntrack_core.c 372
-rw-r--r-- net/netfilter/nf_conntrack_ecache.c 27
-rw-r--r-- net/netfilter/nf_conntrack_expect.c 20
-rw-r--r-- net/netfilter/nf_conntrack_extend.c 4
-rw-r--r-- net/netfilter/nf_conntrack_h323_asn1.c 4
-rw-r--r-- net/netfilter/nf_conntrack_h323_main.c 4
-rw-r--r-- net/netfilter/nf_conntrack_helper.c 113
-rw-r--r-- net/netfilter/nf_conntrack_labels.c 17
-rw-r--r-- net/netfilter/nf_conntrack_netlink.c 248
-rw-r--r-- net/netfilter/nf_conntrack_ovs.c 185
-rw-r--r-- net/netfilter/nf_conntrack_proto.c 27
-rw-r--r-- net/netfilter/nf_conntrack_proto_dccp.c 778
-rw-r--r-- net/netfilter/nf_conntrack_proto_gre.c 11
-rw-r--r-- net/netfilter/nf_conntrack_proto_icmpv6.c 4
-rw-r--r-- net/netfilter/nf_conntrack_proto_sctp.c 117
-rw-r--r-- net/netfilter/nf_conntrack_proto_tcp.c 63
-rw-r--r-- net/netfilter/nf_conntrack_proto_udp.c 10
-rw-r--r-- net/netfilter/nf_conntrack_sip.c 6
-rw-r--r-- net/netfilter/nf_conntrack_standalone.c 276
-rw-r--r-- net/netfilter/nf_dup_netdev.c 22
-rw-r--r-- net/netfilter/nf_flow_table_bpf.c 121
-rw-r--r-- net/netfilter/nf_flow_table_core.c 269
-rw-r--r-- net/netfilter/nf_flow_table_inet.c 8
-rw-r--r-- net/netfilter/nf_flow_table_ip.c 542
-rw-r--r-- net/netfilter/nf_flow_table_offload.c 46
-rw-r--r-- net/netfilter/nf_flow_table_path.c 330
-rw-r--r-- net/netfilter/nf_flow_table_xdp.c 147
-rw-r--r-- net/netfilter/nf_hooks_lwtunnel.c 72
-rw-r--r-- net/netfilter/nf_internals.h 6
-rw-r--r-- net/netfilter/nf_log.c 56
-rw-r--r-- net/netfilter/nf_log_syslog.c 23
-rw-r--r-- net/netfilter/nf_nat_bpf.c 16
-rw-r--r-- net/netfilter/nf_nat_core.c 249
-rw-r--r-- net/netfilter/nf_nat_ovs.c 3
-rw-r--r-- net/netfilter/nf_nat_proto.c 114
-rw-r--r-- net/netfilter/nf_nat_redirect.c 98
-rw-r--r-- net/netfilter/nf_queue.c 112
-rw-r--r-- net/netfilter/nf_synproxy_core.c 10
-rw-r--r-- net/netfilter/nf_tables_api.c 4763
-rw-r--r-- net/netfilter/nf_tables_core.c 107
-rw-r--r-- net/netfilter/nf_tables_offload.c 104
-rw-r--r-- net/netfilter/nf_tables_trace.c 127
-rw-r--r-- net/netfilter/nfnetlink.c 40
-rw-r--r-- net/netfilter/nfnetlink_cttimeout.c 8
-rw-r--r-- net/netfilter/nfnetlink_hook.c 153
-rw-r--r-- net/netfilter/nfnetlink_log.c 56
-rw-r--r-- net/netfilter/nfnetlink_osf.c 10
-rw-r--r-- net/netfilter/nfnetlink_queue.c 260
-rw-r--r-- net/netfilter/nft_bitwise.c 174
-rw-r--r-- net/netfilter/nft_byteorder.c 29
-rw-r--r-- net/netfilter/nft_chain_filter.c 130
-rw-r--r-- net/netfilter/nft_chain_nat.c 1
-rw-r--r-- net/netfilter/nft_cmp.c 8
-rw-r--r-- net/netfilter/nft_compat.c 69
-rw-r--r-- net/netfilter/nft_connlimit.c 60
-rw-r--r-- net/netfilter/nft_counter.c 95
-rw-r--r-- net/netfilter/nft_ct.c 111
-rw-r--r-- net/netfilter/nft_ct_fast.c 62
-rw-r--r-- net/netfilter/nft_dup_netdev.c 2
-rw-r--r-- net/netfilter/nft_dynset.c 71
-rw-r--r-- net/netfilter/nft_exthdr.c 176
-rw-r--r-- net/netfilter/nft_fib.c 35
-rw-r--r-- net/netfilter/nft_flow_offload.c 290
-rw-r--r-- net/netfilter/nft_fwd_netdev.c 12
-rw-r--r-- net/netfilter/nft_hash.c 7
-rw-r--r-- net/netfilter/nft_immediate.c 102
-rw-r--r-- net/netfilter/nft_inner.c 72
-rw-r--r-- net/netfilter/nft_last.c 8
-rw-r--r-- net/netfilter/nft_limit.c 56
-rw-r--r-- net/netfilter/nft_log.c 2
-rw-r--r-- net/netfilter/nft_lookup.c 152
-rw-r--r-- net/netfilter/nft_masq.c 92
-rw-r--r-- net/netfilter/nft_meta.c 23
-rw-r--r-- net/netfilter/nft_nat.c 26
-rw-r--r-- net/netfilter/nft_numgen.c 2
-rw-r--r-- net/netfilter/nft_objref.c 68
-rw-r--r-- net/netfilter/nft_osf.c 20
-rw-r--r-- net/netfilter/nft_payload.c 130
-rw-r--r-- net/netfilter/nft_queue.c 5
-rw-r--r-- net/netfilter/nft_quota.c 30
-rw-r--r-- net/netfilter/nft_range.c 4
-rw-r--r-- net/netfilter/nft_redir.c 103
-rw-r--r-- net/netfilter/nft_reject.c 5
-rw-r--r-- net/netfilter/nft_reject_inet.c 3
-rw-r--r-- net/netfilter/nft_reject_netdev.c 3
-rw-r--r-- net/netfilter/nft_rt.c 14
-rw-r--r-- net/netfilter/nft_set_bitmap.c 84
-rw-r--r-- net/netfilter/nft_set_hash.c 399
-rw-r--r-- net/netfilter/nft_set_pipapo.c 952
-rw-r--r-- net/netfilter/nft_set_pipapo.h 88
-rw-r--r-- net/netfilter/nft_set_pipapo_avx2.c 236
-rw-r--r-- net/netfilter/nft_set_pipapo_avx2.h 4
-rw-r--r-- net/netfilter/nft_set_rbtree.c 457
-rw-r--r-- net/netfilter/nft_socket.c 62
-rw-r--r-- net/netfilter/nft_synproxy.c 10
-rw-r--r-- net/netfilter/nft_tproxy.c 14
-rw-r--r-- net/netfilter/nft_tunnel.c 73
-rw-r--r-- net/netfilter/nft_xfrm.c 15
-rw-r--r-- net/netfilter/utils.c 79
-rw-r--r-- net/netfilter/x_tables.c 26
-rw-r--r-- net/netfilter/xt_CHECKSUM.c 33
-rw-r--r-- net/netfilter/xt_CLASSIFY.c 16
-rw-r--r-- net/netfilter/xt_CONNSECMARK.c 36
-rw-r--r-- net/netfilter/xt_CT.c 106
-rw-r--r-- net/netfilter/xt_IDLETIMER.c 131
-rw-r--r-- net/netfilter/xt_LED.c 51
-rw-r--r-- net/netfilter/xt_NFLOG.c 36
-rw-r--r-- net/netfilter/xt_RATEEST.c 39
-rw-r--r-- net/netfilter/xt_REDIRECT.c 10
-rw-r--r-- net/netfilter/xt_SECMARK.c 27
-rw-r--r-- net/netfilter/xt_TCPOPTSTRIP.c 4
-rw-r--r-- net/netfilter/xt_TRACE.c 36
-rw-r--r-- net/netfilter/xt_addrtype.c 15
-rw-r--r-- net/netfilter/xt_cgroup.c 26
-rw-r--r-- net/netfilter/xt_cluster.c 33
-rw-r--r-- net/netfilter/xt_connbytes.c 4
-rw-r--r-- net/netfilter/xt_connlimit.c 68
-rw-r--r-- net/netfilter/xt_connmark.c 28
-rw-r--r-- net/netfilter/xt_hashlimit.c 18
-rw-r--r-- net/netfilter/xt_length.c 5
-rw-r--r-- net/netfilter/xt_mark.c 42
-rw-r--r-- net/netfilter/xt_nfacct.c 4
-rw-r--r-- net/netfilter/xt_osf.c 1
-rw-r--r-- net/netfilter/xt_owner.c 16
-rw-r--r-- net/netfilter/xt_physdev.c 2
-rw-r--r-- net/netfilter/xt_recent.c 10
-rw-r--r-- net/netfilter/xt_repldata.h 2
-rw-r--r-- net/netfilter/xt_sctp.c 2
-rw-r--r-- net/netfilter/xt_socket.c 4
-rw-r--r-- net/netfilter/xt_tcpudp.c 110
-rw-r--r-- net/netfilter/xt_u32.c 21
179 files changed, 11262 insertions, 6318 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f71b41c7ce2f..6cdc994fdc8a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE
config NETFILTER_FAMILY_ARP
bool
+config NETFILTER_BPF_LINK
+ def_bool BPF_SYSCALL
+
config NETFILTER_NETLINK_HOOK
tristate "Netfilter base hook dump support"
depends on NETFILTER_ADVANCED
@@ -189,15 +192,8 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It can be used with xtables connlabel
match and the nftables ct expression.
-config NF_CT_PROTO_DCCP
- bool 'DCCP protocol connection tracking support'
- depends on NETFILTER_ADVANCED
- default y
- help
- With this option enabled, the layer 3 independent connection
- tracking code will be able to do state tracking on DCCP connections.
-
- If unsure, say Y.
+config NF_CONNTRACK_OVS
+ bool
config NF_CT_PROTO_GRE
bool
@@ -206,7 +202,7 @@ config NF_CT_PROTO_SCTP
bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
default y
- select LIBCRC32C
+ select NET_CRC32C
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
@@ -469,7 +465,7 @@ endif # NF_CONNTRACK
config NF_TABLES
select NETFILTER_NETLINK
- select LIBCRC32C
+ select NET_CRC32C
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
@@ -510,6 +506,12 @@ config NFT_CT
This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
+config NFT_EXTHDR_DCCP
+ bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)"
+ default n
+ help
+ This option adds support for matching on DCCP extension headers.
+
config NFT_FLOW_OFFLOAD
depends on NF_CONNTRACK && NF_FLOW_TABLE
tristate "Netfilter nf_tables hardware flow offload module"
@@ -750,13 +752,22 @@ if NETFILTER_XTABLES
config NETFILTER_XTABLES_COMPAT
bool "Netfilter Xtables 32bit support"
depends on COMPAT
- default y
help
This option provides a translation layer to run 32bit arp,ip(6),ebtables
binaries on 64bit kernels.
If unsure, say N.
+config NETFILTER_XTABLES_LEGACY
+ bool "Netfilter legacy tables support"
+ depends on !PREEMPT_RT
+ help
+ Say Y here if you still require support for legacy tables. This is
+ required by the legacy tools (iptables-legacy) and is not needed if
+ you use iptables over nftables (iptables-nft).
+ Legacy support is not limited to IP, it also includes EBTABLES and
+ ARPTABLES.
+
comment "Xtables combined modules"
config NETFILTER_XT_MARK
@@ -813,7 +824,7 @@ config NETFILTER_XT_TARGET_AUDIT
config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a `CHECKSUM' target, which can be used in the iptables mangle
@@ -864,7 +875,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK
config NETFILTER_XT_TARGET_CT
tristate '"CT" target support'
depends on NF_CONNTRACK
- depends on IP_NF_RAW || IP6_NF_RAW
+ depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This options adds a `CT' target, which allows to specify initial
@@ -875,7 +886,7 @@ config NETFILTER_XT_TARGET_CT
config NETFILTER_XT_TARGET_DSCP
tristate '"DSCP" and "TOS" target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a `DSCP' target, which allows you to manipulate
@@ -891,7 +902,7 @@ config NETFILTER_XT_TARGET_DSCP
config NETFILTER_XT_TARGET_HL
tristate '"HL" hoplimit target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
@@ -1075,7 +1086,7 @@ config NETFILTER_XT_TARGET_TPROXY
depends on NETFILTER_ADVANCED
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
- depends on IP_NF_MANGLE
+ depends on IP_NF_MANGLE || NFT_COMPAT
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
select NF_TPROXY_IPV4
@@ -1142,7 +1153,7 @@ config NETFILTER_XT_TARGET_TCPMSS
config NETFILTER_XT_TARGET_TCPOPTSTRIP
tristate '"TCPOPTSTRIP" target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a "TCPOPTSTRIP" target, which allows you to strip
@@ -1175,7 +1186,7 @@ config NETFILTER_XT_MATCH_CGROUP
tristate '"control group" match support'
depends on NETFILTER_ADVANCED
depends on CGROUPS
- select CGROUP_NET_CLASSID
+ select SOCK_CGROUP_DATA
help
Socket/process control group matching allows you to match locally
generated packets based on which net_cls control group processes
@@ -1273,9 +1284,9 @@ config NETFILTER_XT_MATCH_CPU
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_MATCH_DCCP
- tristate '"dccp" protocol match support'
+ tristate '"dccp" protocol match support (DEPRECATED)'
depends on NETFILTER_ADVANCED
- default IP_DCCP
+ default n
help
With this option enabled, you will be able to use the iptables
`dccp' match in order to match on DCCP source/destination ports
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3754eb06fb41..6bfc250e474f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -11,7 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
-nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
ifeq ($(CONFIG_NF_CONNTRACK),m)
@@ -21,6 +21,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
endif
obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
@@ -98,6 +99,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o
endif
endif
+ifdef CONFIG_NFT_CT
+ifdef CONFIG_MITIGATION_RETPOLINE
+nf_tables-objs += nft_ct_fast.o
+endif
+endif
+
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
@@ -134,8 +141,14 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \
- nf_flow_table_offload.o
+ nf_flow_table_path.o \
+ nf_flow_table_offload.o nf_flow_table_xdp.o
nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
+ifeq ($(CONFIG_NF_FLOW_TABLE),m)
+nf_flow_table-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_flow_table_bpf.o
+else ifeq ($(CONFIG_NF_FLOW_TABLE),y)
+nf_flow_table-$(CONFIG_DEBUG_INFO_BTF) += nf_flow_table_bpf.o
+endif
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5a6705a0e4ec..11a702065bab 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -31,9 +31,6 @@
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
-DEFINE_PER_CPU(bool, nf_skb_duplicated);
-EXPORT_SYMBOL_GPL(nf_skb_duplicated);
-
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
@@ -119,6 +116,18 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
for (i = 0; i < old_entries; i++) {
if (orig_ops[i] != &dummy_ops)
alloc_entries++;
+
+ /* Restrict BPF hook type to force a unique priority, not
+ * shared at attach time.
+ *
+ * This is mainly to avoid ordering issues between two
+ * different bpf programs, this doesn't prevent a normal
+ * hook at same priority as a bpf one (we don't want to
+ * prevent defrag, conntrack, iptables etc from attaching).
+ */
+ if (reg->priority == orig_ops[i]->priority &&
+ reg->hook_ops_type == NF_HOOK_OP_BPF)
+ return ERR_PTR(-EBUSY);
}
}
@@ -627,10 +636,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
if (ret == 1)
continue;
return ret;
+ case NF_STOLEN:
+ return NF_DROP_GETERR(verdict);
default:
- /* Implicit handling for NF_STOLEN, as well as any other
- * non conventional verdicts.
- */
+ WARN_ON_ONCE(1);
return 0;
}
}
@@ -643,11 +652,9 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
const struct nf_hook_entries *e)
{
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
int ret;
- INIT_LIST_HEAD(&sublist);
-
list_for_each_entry_safe(skb, next, head, list) {
skb_list_del_init(skb);
ret = nf_hook_slow(skb, state, e, 0);
@@ -668,7 +675,16 @@ EXPORT_SYMBOL_GPL(nfnl_ct_hook);
const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);
+const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_defrag_v4_hook);
+
+const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_defrag_v6_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+u8 nf_ctnetlink_has_listener;
+EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener);
+
const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);
@@ -696,12 +712,30 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
- BUG_ON(ct_hook == NULL);
- ct_hook->destroy(nfct);
+ if (ct_hook)
+ ct_hook->destroy(nfct);
rcu_read_unlock();
+
+ WARN_ON(!ct_hook);
}
EXPORT_SYMBOL(nf_conntrack_destroy);
+void nf_ct_set_closing(struct nf_conntrack *nfct)
+{
+ const struct nf_ct_hook *ct_hook;
+
+ if (!nfct)
+ return;
+
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->set_closing(nfct);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_set_closing);
+
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
@@ -776,12 +810,21 @@ int __init netfilter_init(void)
if (ret < 0)
goto err;
+#ifdef CONFIG_LWTUNNEL
+ ret = netfilter_lwtunnel_init();
+ if (ret < 0)
+ goto err_lwtunnel_pernet;
+#endif
ret = netfilter_log_init();
if (ret < 0)
- goto err_pernet;
+ goto err_log_pernet;
return 0;
-err_pernet:
+err_log_pernet:
+#ifdef CONFIG_LWTUNNEL
+ netfilter_lwtunnel_fini();
+err_lwtunnel_pernet:
+#endif
unregister_pernet_subsys(&netfilter_net_ops);
err:
return ret;
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 3c273483df23..b1ea054bb82c 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -30,7 +30,7 @@ config IP_SET_BITMAP_IP
depends on IP_SET
help
This option adds the bitmap:ip set type support, by which one
- can store IPv4 addresses (or network addresse) from a range.
+ can store IPv4 addresses (or network addresses) from a range.
To compile it as a module, choose M here. If unsure, say N.
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 26ab0e9612d8..798c7993635e 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -4,6 +4,8 @@
#ifndef __IP_SET_BITMAP_IP_GEN_H
#define __IP_SET_BITMAP_IP_GEN_H
+#include <linux/rcupdate_wait.h>
+
#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test)
#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test)
#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled)
@@ -28,6 +30,7 @@
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype MTYPE
#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
@@ -57,9 +60,6 @@ mtype_destroy(struct ip_set *set)
{
struct mtype *map = set->data;
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
-
if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
ip_set_free(map->members);
@@ -264,7 +264,7 @@ out:
static void
mtype_gc(struct timer_list *t)
{
- struct mtype *map = from_timer(map, t, gc);
+ struct mtype *map = timer_container_of(map, t, gc);
struct ip_set *set = map->set;
void *x;
u32 id;
@@ -288,6 +288,15 @@ mtype_gc(struct timer_list *t)
add_timer(&map->gc);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_delete_sync(&map->gc);
+}
+
static const struct ip_set_type_variant mtype = {
.kadt = mtype_kadt,
.uadt = mtype_uadt,
@@ -301,6 +310,7 @@ static const struct ip_set_type_variant mtype = {
.head = mtype_head,
.list = mtype_list,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
};
#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index e4fa00abde6a..5988b9bb9029 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to) {
+ if (ip > ip_to)
swap(ip, ip_to);
- if (ip < map->first_ip)
- return -IPSET_ERR_BITMAP_RANGE;
- }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
ip_to = ip;
}
- if (ip_to > map->last_ip)
+ if (ip < map->first_ip || ip_to > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
for (; !before(ip_to, ip); ip += map->hosts) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 46ebee9400da..cc20e6d56807 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -53,14 +53,17 @@ MODULE_DESCRIPTION("core IP set support");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
/* When the nfnl mutex or ip_set_ref_lock is held: */
-#define ip_set_dereference(p) \
- rcu_dereference_protected(p, \
+#define ip_set_dereference(inst) \
+ rcu_dereference_protected((inst)->ip_set_list, \
lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
- lockdep_is_held(&ip_set_ref_lock))
+ lockdep_is_held(&ip_set_ref_lock) || \
+ (inst)->is_deleted)
#define ip_set(inst, id) \
- ip_set_dereference((inst)->ip_set_list)[id]
+ ip_set_dereference(inst)[id]
#define ip_set_ref_netlink(inst,id) \
rcu_dereference_raw((inst)->ip_set_list)[id]
+#define ip_set_dereference_nfnl(p) \
+ rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
/* The set types are implemented in modules and registered set types
* can be found in ip_set_type_list. Adding/deleting types is
@@ -101,14 +104,19 @@ find_set_type(const char *name, u8 family, u8 revision)
static bool
load_settype(const char *name)
{
+ if (!try_module_get(THIS_MODULE))
+ return false;
+
nfnl_unlock(NFNL_SUBSYS_IPSET);
pr_debug("try to load ip_set_%s\n", name);
if (request_module("ip_set_%s", name) < 0) {
pr_warn("Can't find ip_set type %s\n", name);
nfnl_lock(NFNL_SUBSYS_IPSET);
+ module_put(THIS_MODULE);
return false;
}
nfnl_lock(NFNL_SUBSYS_IPSET);
+ module_put(THIS_MODULE);
return true;
}
@@ -683,6 +691,14 @@ __ip_set_put(struct ip_set *set)
* a separate reference counter
*/
static void
+__ip_set_get_netlink(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref_netlink++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+static void
__ip_set_put_netlink(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -700,15 +716,10 @@ __ip_set_put_netlink(struct ip_set *set)
static struct ip_set *
ip_set_rcu_get(struct net *net, ip_set_id_t index)
{
- struct ip_set *set;
struct ip_set_net *inst = ip_set_pernet(net);
- rcu_read_lock();
- /* ip_set_list itself needs to be protected */
- set = rcu_dereference(inst->ip_set_list)[index];
- rcu_read_unlock();
-
- return set;
+ /* ip_set_list and the set pointer need to be protected */
+ return ip_set_dereference_nfnl(inst->ip_set_list)[index];
}
static inline void
@@ -739,9 +750,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return 0;
- rcu_read_lock_bh();
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
- rcu_read_unlock_bh();
if (ret == -EAGAIN) {
/* Type requests element to be completed */
@@ -874,7 +883,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
BUG_ON(!set);
read_lock_bh(&ip_set_ref_lock);
- strncpy(name, set->name, IPSET_MAXNAMELEN);
+ strscpy_pad(name, set->name, IPSET_MAXNAMELEN);
read_unlock_bh(&ip_set_ref_lock);
}
EXPORT_SYMBOL_GPL(ip_set_name_byindex);
@@ -1130,7 +1139,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
if (!list)
goto cleanup;
/* nfnl mutex is held, both lists are valid */
- tmp = ip_set_dereference(inst->ip_set_list);
+ tmp = ip_set_dereference(inst);
memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
rcu_assign_pointer(inst->ip_set_list, list);
/* Make sure all current packets have passed through */
@@ -1151,6 +1160,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
return ret;
cleanup:
+ set->variant->cancel_gc(set);
set->variant->destroy(set);
put_out:
module_put(set->type->me);
@@ -1168,17 +1178,52 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
+/* In order to return quickly when destroying a single set, it is split
+ * into two stages:
+ * - Cancel garbage collector
+ * - Destroy the set itself via call_rcu()
+ */
+
static void
-ip_set_destroy_set(struct ip_set *set)
+ip_set_destroy_set_rcu(struct rcu_head *head)
{
- pr_debug("set: %s\n", set->name);
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
- /* Must call it without holding any lock */
set->variant->destroy(set);
module_put(set->type->me);
kfree(set);
}
+static void
+_destroy_all_sets(struct ip_set_net *inst)
+{
+ struct ip_set *set;
+ ip_set_id_t i;
+ bool need_wait = false;
+
+ /* First cancel gc's: set:list sets are flushed as well */
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set) {
+ set->variant->cancel_gc(set);
+ if (set->type->features & IPSET_TYPE_NAME)
+ need_wait = true;
+ }
+ }
+ /* Must wait for flush to be really finished */
+ if (need_wait)
+ rcu_barrier();
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set) {
+ ip_set(inst, i) = NULL;
+ set->variant->destroy(set);
+ module_put(set->type->me);
+ kfree(set);
+ }
+ }
+}
+
static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const attr[])
{
@@ -1190,21 +1235,18 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
-
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
* External systems (i.e. xt_set) must call
- * ip_set_put|get_nfnl_* functions, that way we
+ * ip_set_nfnl_get_* functions, that way we
* can safely check references here.
*
* list:set timer can only decrement the reference
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1214,17 +1256,14 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
}
inst->is_destroyed = true;
read_unlock_bh(&ip_set_ref_lock);
- for (i = 0; i < inst->ip_set_max; i++) {
- s = ip_set(inst, i);
- if (s) {
- ip_set(inst, i) = NULL;
- ip_set_destroy_set(s);
- }
- }
+ _destroy_all_sets(inst);
/* Modified by ip_set_destroy() only, which is serialized */
inst->is_destroyed = false;
} else {
u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
@@ -1235,10 +1274,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1328,7 +1373,7 @@ static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info,
goto out;
}
}
- strncpy(set->name, name2, IPSET_MAXNAMELEN);
+ strscpy_pad(set->name, name2, IPSET_MAXNAMELEN);
out:
write_unlock_bh(&ip_set_ref_lock);
@@ -1382,9 +1427,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
return -EBUSY;
}
- strncpy(from_name, from->name, IPSET_MAXNAMELEN);
- strncpy(from->name, to->name, IPSET_MAXNAMELEN);
- strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+ strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN);
+ strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN);
+ strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN);
swap(from->ref, to->ref);
ip_set(inst, from_id) = to;
@@ -1694,6 +1739,14 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
+ if (retried) {
+ __ip_set_get_netlink(set);
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ cond_resched();
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ __ip_set_put_netlink(set);
+ }
+
ip_set_lock(set);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
ip_set_unlock(set);
@@ -2334,29 +2387,25 @@ ip_set_net_init(struct net *net)
}
static void __net_exit
-ip_set_net_exit(struct net *net)
+ip_set_net_pre_exit(struct net *net)
{
struct ip_set_net *inst = ip_set_pernet(net);
- struct ip_set *set = NULL;
- ip_set_id_t i;
-
inst->is_deleted = true; /* flag for ip_set_nfnl_put */
+}
- nfnl_lock(NFNL_SUBSYS_IPSET);
- for (i = 0; i < inst->ip_set_max; i++) {
- set = ip_set(inst, i);
- if (set) {
- ip_set(inst, i) = NULL;
- ip_set_destroy_set(set);
- }
- }
- nfnl_unlock(NFNL_SUBSYS_IPSET);
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ _destroy_all_sets(inst);
kvfree(rcu_dereference_protected(inst->ip_set_list, 1));
}
static struct pernet_operations ip_set_net_ops = {
.init = ip_set_net_init,
+ .pre_exit = ip_set_net_pre_exit,
.exit = ip_set_net_exit,
.id = &ip_set_net_id,
.size = sizeof(struct ip_set_net),
@@ -2395,8 +2444,11 @@ ip_set_fini(void)
{
nf_unregister_sockopt(&so_set);
nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
unregister_pernet_subsys(&ip_set_net_ops);
+
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
pr_debug("these are the famous last words\n");
}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7c2399541771..5e4453e9ef8e 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -5,6 +5,7 @@
#define _IP_SET_HASH_GEN_H
#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/jhash.h>
#include <linux/types.h>
#include <linux/netfilter/nfnetlink.h>
@@ -62,8 +63,8 @@ struct hbucket {
: jhash_size((htable_bits) - HTABLE_REGION_BITS))
#define ahash_sizeof_regions(htable_bits) \
(ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
-#define ahash_region(n, htable_bits) \
- ((n) % ahash_numof_locks(htable_bits))
+#define ahash_region(n) \
+ ((n) / jhash_size(HTABLE_REGION_BITS))
#define ahash_bucket_start(h, htable_bits) \
((htable_bits) < HTABLE_REGION_BITS ? 0 \
: (h) * jhash_size(HTABLE_REGION_BITS))
@@ -221,6 +222,7 @@ static const union nf_inet_addr zeromask = {};
#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
+#undef mtype_cancel_gc
#undef mtype_variant
#undef mtype_data_match
@@ -265,6 +267,7 @@ static const union nf_inet_addr zeromask = {};
#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
@@ -429,7 +432,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference(hbucket(t, i));
+ n = (__force struct hbucket *)hbucket(t, i);
if (!n)
continue;
if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -449,10 +452,7 @@ mtype_destroy(struct ip_set *set)
struct htype *h = set->data;
struct list_head *l, *lt;
- if (SET_WITH_TIMEOUT(set))
- cancel_delayed_work_sync(&h->gc.dwork);
-
- mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
+ mtype_ahash_destroy(set, (__force struct htable *)h->table, true);
list_for_each_safe(l, lt, &h->ad) {
list_del(l);
kfree(l);
@@ -598,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc)
queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ cancel_delayed_work_sync(&h->gc.dwork);
+}
+
static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags);
@@ -693,7 +702,7 @@ retry:
#endif
key = HKEY(data, h->initval, htable_bits);
m = __ipset_dereference(hbucket(t, key));
- nr = ahash_region(key, htable_bits);
+ nr = ahash_region(key);
if (!m) {
m = kzalloc(sizeof(*m) +
AHASH_INIT_SIZE * dsize,
@@ -843,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- r = ahash_region(key, t->htable_bits);
+ r = ahash_region(key);
atomic_inc(&t->uref);
elements = t->hregion[r].elements;
maxelem = t->maxelem;
@@ -1041,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- r = ahash_region(key, t->htable_bits);
+ r = ahash_region(key);
atomic_inc(&t->uref);
rcu_read_unlock_bh();
@@ -1440,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
.region_lock = true,
};
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 031073286236..30a655e5c4fd 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -40,7 +40,7 @@ MODULE_ALIAS("ip_set_hash:net,iface");
#define IP_SET_HASH_WITH_MULTI
#define IP_SET_HASH_WITH_NET0
-#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ)
+#define STRSCPY(a, b) strscpy(a, b, IFNAMSIZ)
/* IPv4 variant */
@@ -138,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
#include "ip_set_hash_gen.h"
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-static const char *get_physindev_name(const struct sk_buff *skb)
+static const char *get_physindev_name(const struct sk_buff *skb, struct net *net)
{
- struct net_device *dev = nf_bridge_get_physindev(skb);
+ struct net_device *dev = nf_bridge_get_physindev(skb, net);
return dev ? dev->name : NULL;
}
@@ -177,16 +177,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
return -EINVAL;
- STRLCPY(e.iface, eiface);
+ STRSCPY(e.iface, eiface);
e.physdev = 1;
#endif
} else {
- STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
}
if (strlen(e.iface) == 0)
@@ -395,16 +395,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
return -EINVAL;
- STRLCPY(e.iface, eiface);
+ STRSCPY(e.iface, eiface);
e.physdev = 1;
#endif
} else {
- STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
}
if (strlen(e.iface) == 0)
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 005a7ce87217..bf4f91b78e1d 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -36,6 +36,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net");
#define IP_SET_HASH_WITH_PROTO
#define IP_SET_HASH_WITH_NETS
#define IPSET_NET_COUNT 2
+#define IP_SET_HASH_WITH_NET0
/* IPv4 variant */
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index e162636525cf..13c7a08aa868 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
struct set_elem *e;
int ret;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
struct set_elem *e;
int ret;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct list_set *map = set->data;
struct set_adt_elem *d = value;
struct set_elem *e, *next, *prev = NULL;
- int ret;
+ int ret = 0;
- list_for_each_entry(e, &map->members, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (d->before == 0) {
ret = 1;
+ goto out;
} else if (d->before > 0) {
next = list_next_entry(e, list);
ret = !list_is_last(&e->list, &map->members) &&
@@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
} else {
ret = prev && prev->id == d->refid;
}
- return ret;
+ goto out;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static void
@@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
/* Find where to add the new entry */
n = prev = next = NULL;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e, *next, *prev = NULL;
+ struct set_elem *e, *n, *next, *prev = NULL;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_safe(e, n, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -424,17 +428,8 @@ static void
list_set_destroy(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e, *n;
- if (SET_WITH_TIMEOUT(set))
- timer_shutdown_sync(&map->gc);
-
- list_for_each_entry_safe(e, n, &map->members, list) {
- list_del(&e->list);
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- kfree(e);
- }
+ WARN_ON_ONCE(!list_empty(&map->members));
kfree(map);
set->data = NULL;
@@ -545,6 +540,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_shutdown_sync(&map->gc);
+
+ /* Flush list to drop references to other ipsets */
+ list_set_flush(set);
+}
+
static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt,
.uadt = list_set_uadt,
@@ -558,12 +565,13 @@ static const struct ip_set_type_variant set_variant = {
.head = list_set_head,
.list = list_set_list,
.same_set = list_set_same_set,
+ .cancel_gc = list_set_cancel_gc,
};
static void
list_set_gc(struct timer_list *t)
{
- struct list_set *map = from_timer(map, t, gc);
+ struct list_set *map = timer_container_of(map, t, gc);
struct ip_set *set = map->set;
spin_lock_bh(&set->lock);
@@ -603,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size)
return true;
}
+static struct lock_class_key list_set_lockdep_key;
+
static int
list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
u32 flags)
@@ -619,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (size < IP_SET_LIST_MIN_SIZE)
size = IP_SET_LIST_MIN_SIZE;
+ lockdep_set_class(&set->lock, &list_set_lockdep_key);
set->variant = &set_variant;
set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem),
__alignof__(struct set_elem));
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 271da8447b29..c203252e856d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -44,7 +44,8 @@ config IP_VS_DEBUG
config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
- range 8 20
+ range 8 20 if !64BIT
+ range 8 27 if 64BIT
default 12
help
The IPVS connection hash table uses the chaining scheme to handle
@@ -54,24 +55,24 @@ config IP_VS_TAB_BITS
Note the table size must be power of 2. The table size will be the
value of 2 to the your input number power. The number to choose is
- from 8 to 20, the default number is 12, which means the table size
- is 4096. Don't input the number too small, otherwise you will lose
- performance on it. You can adapt the table size yourself, according
- to your virtual server application. It is good to set the table size
- not far less than the number of connections per second multiplying
- average lasting time of connection in the table. For example, your
- virtual server gets 200 connections per second, the connection lasts
- for 200 seconds in average in the connection table, the table size
- should be not far less than 200x200, it is good to set the table
- size 32768 (2**15).
+ from 8 to 27 for 64BIT(20 otherwise), the default number is 12,
+ which means the table size is 4096. Don't input the number too
+ small, otherwise you will lose performance on it. You can adapt the
+ table size yourself, according to your virtual server application.
+ It is good to set the table size not far less than the number of
+ connections per second multiplying average lasting time of
+ connection in the table. For example, your virtual server gets 200
+ connections per second, the connection lasts for 200 seconds in
+ average in the connection table, the table size should be not far
+ less than 200x200, it is good to set the table size 32768 (2**15).
Another note that each connection occupies 128 bytes effectively and
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
You can overwrite this number setting conn_tab_bits module parameter
- or by appending ip_vs.conn_tab_bits=? to the kernel command line
- if IP VS was compiled built-in.
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line if
+ IP VS was compiled built-in.
comment "IPVS transport protocol load balancing support"
@@ -104,7 +105,7 @@ config IP_VS_PROTO_AH
config IP_VS_PROTO_SCTP
bool "SCTP load balancing support"
- select LIBCRC32C
+ select NET_CRC32C
help
This option enables support for load balancing SCTP transport
protocol. Say Y if unsure.
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index fdacbc3c15be..d54d7da58334 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -13,8 +13,7 @@
* Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 13534e02346c..50cc492c7553 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -17,8 +17,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/interrupt.h>
#include <linux/in.h>
@@ -26,12 +25,12 @@
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/rcupdate_wait.h>
#include <net/net_namespace.h>
#include <net/ip_vs.h>
@@ -822,7 +821,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head)
/* Try to delete connection while not holding reference */
static void ip_vs_conn_del(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer)) {
+ if (timer_delete(&cp->timer)) {
/* Drop cp->control chain too */
if (cp->control)
cp->timeout = 0;
@@ -833,7 +832,7 @@ static void ip_vs_conn_del(struct ip_vs_conn *cp)
/* Try to delete connection while holding reference */
static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer)) {
+ if (timer_delete(&cp->timer)) {
/* Drop cp->control chain too */
if (cp->control)
cp->timeout = 0;
@@ -846,7 +845,7 @@ static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
static void ip_vs_conn_expire(struct timer_list *t)
{
- struct ip_vs_conn *cp = from_timer(cp, t, timer);
+ struct ip_vs_conn *cp = timer_container_of(cp, t, timer);
struct netns_ipvs *ipvs = cp->ipvs;
/*
@@ -860,7 +859,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
struct ip_vs_conn *ct = cp->control;
/* delete the timer if it is activated by other users */
- del_timer(&cp->timer);
+ timer_delete(&cp->timer);
/* does anybody control me? */
if (ct) {
@@ -885,7 +884,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
* conntrack cleanup for the net.
*/
smp_rmb();
- if (ipvs->enable)
+ if (READ_ONCE(ipvs->enable))
ip_vs_conn_drop_conntrack(cp);
}
@@ -926,7 +925,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
/* Using mod_timer_pending will ensure the timer is not
- * modified after the final del_timer in ip_vs_conn_expire.
+ * modified after the final timer_delete in ip_vs_conn_expire.
*/
if (timer_pending(&cp->timer) &&
time_after(cp->timer.expires, jiffies))
@@ -1046,28 +1045,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
#ifdef CONFIG_PROC_FS
struct ip_vs_iter_state {
struct seq_net_private p;
- struct hlist_head *l;
+ unsigned int bucket;
+ unsigned int skip_elems;
};
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+static void *ip_vs_conn_array(struct ip_vs_iter_state *iter)
{
int idx;
struct ip_vs_conn *cp;
- struct ip_vs_iter_state *iter = seq->private;
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) {
+ unsigned int skip = 0;
+
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
/* __ip_vs_conn_get() is not needed by
* ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
*/
- if (pos-- == 0) {
- iter->l = &ip_vs_conn_tab[idx];
+ if (skip >= iter->skip_elems) {
+ iter->bucket = idx;
return cp;
}
+
+ ++skip;
}
+
+ iter->skip_elems = 0;
cond_resched_rcu();
}
+ iter->bucket = idx;
return NULL;
}
@@ -1076,9 +1082,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ip_vs_iter_state *iter = seq->private;
- iter->l = NULL;
rcu_read_lock();
- return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+ if (*pos == 0) {
+ iter->skip_elems = 0;
+ iter->bucket = 0;
+ return SEQ_START_TOKEN;
+ }
+
+ return ip_vs_conn_array(iter);
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -1086,28 +1097,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
struct hlist_node *e;
- struct hlist_head *l = iter->l;
- int idx;
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_conn_array(seq, 0);
+ return ip_vs_conn_array(iter);
/* more on same hash chain? */
e = rcu_dereference(hlist_next_rcu(&cp->c_list));
- if (e)
+ if (e) {
+ iter->skip_elems++;
return hlist_entry(e, struct ip_vs_conn, c_list);
-
- idx = l - ip_vs_conn_tab;
- while (++idx < ip_vs_conn_tab_size) {
- hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
- iter->l = &ip_vs_conn_tab[idx];
- return cp;
- }
- cond_resched_rcu();
}
- iter->l = NULL;
- return NULL;
+
+ iter->skip_elems = 0;
+ iter->bucket++;
+
+ return ip_vs_conn_array(iter);
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
@@ -1433,7 +1438,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
cond_resched_rcu();
/* netns clean up started, abort delayed work */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
break;
}
rcu_read_unlock();
@@ -1481,37 +1486,44 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
int __init ip_vs_conn_init(void)
{
+ size_t tab_array_size;
+ int max_avail;
+#if BITS_PER_LONG > 32
+ int max = 27;
+#else
+ int max = 20;
+#endif
+ int min = 8;
int idx;
- /* Compute size and mask */
- if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
- pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
- ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
- }
+ max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
+ max_avail -= 2; /* ~4 in hash row */
+ max_avail -= 1; /* IPVS up to 1/2 of mem */
+ max_avail -= order_base_2(sizeof(struct ip_vs_conn));
+ max = clamp(max_avail, min, max);
+ ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max);
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size,
- sizeof(*ip_vs_conn_tab)));
+ tab_array_size = array_size(ip_vs_conn_tab_size,
+ sizeof(*ip_vs_conn_tab));
+ ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size,
+ sizeof(*ip_vs_conn_tab), GFP_KERNEL);
if (!ip_vs_conn_tab)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
- ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
- sizeof(struct ip_vs_conn), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN);
if (!ip_vs_conn_cachep) {
- vfree(ip_vs_conn_tab);
+ kvfree(ip_vs_conn_tab);
return -ENOMEM;
}
- pr_info("Connection hash table configured "
- "(size=%d, memory=%ldKbytes)\n",
- ip_vs_conn_tab_size,
- (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
+ pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n",
+ ip_vs_conn_tab_size, tab_array_size / 1024);
IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
sizeof(struct ip_vs_conn));
@@ -1534,5 +1546,5 @@ void ip_vs_conn_cleanup(void)
rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- vfree(ip_vs_conn_tab);
+ kvfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2fcc26507d69..90d56f92c0f6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -19,8 +19,7 @@
* Harald Welte don't use nfcache
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -1140,7 +1139,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
__be16 vport;
unsigned int flags;
- EnterFunction(12);
vaddr = &svc->addr;
vport = svc->port;
daddr = &iph->saddr;
@@ -1208,7 +1206,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
cp->flags, refcount_read(&cp->refcnt));
- LeaveFunction(12);
return cp;
}
@@ -1316,13 +1313,11 @@ after_nat:
ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
- LeaveFunction(11);
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
- LeaveFunction(11);
return NF_STOLEN;
}
@@ -1341,8 +1336,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat
int af = state->pf;
struct sock *sk;
- EnterFunction(11);
-
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
@@ -1352,16 +1345,13 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
+ if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- if (!ipvs->enable)
- return NF_ACCEPT;
-
ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1556,6 +1546,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
if (!dest)
goto unk;
if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
__be16 type;
/* Only support version 0 and C (csum) */
@@ -1566,7 +1557,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
if (type != htons(ETH_P_IP))
goto unk;
*proto = IPPROTO_IPIP;
- return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
+
+ gre_flags_to_tnl_flags(flags, greh->flags);
+
+ return gre_calc_hlen(flags);
}
unk:
@@ -1942,7 +1936,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, false, &iph);
@@ -1952,7 +1946,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
+ if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
}
@@ -2110,7 +2104,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
int r;
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
if (state->pf == NFPROTO_IPV4) {
@@ -2297,7 +2291,7 @@ static int __net_init __ip_vs_init(struct net *net)
return -ENOMEM;
/* Hold the beast until a service is registered */
- ipvs->enable = 0;
+ WRITE_ONCE(ipvs->enable, 0);
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
@@ -2365,16 +2359,14 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
struct netns_ipvs *ipvs;
struct net *net;
- EnterFunction(2);
list_for_each_entry(net, net_list, exit_list) {
ipvs = net_ipvs(net);
ip_vs_unregister_hooks(ipvs, AF_INET);
ip_vs_unregister_hooks(ipvs, AF_INET6);
- ipvs->enable = 0; /* Disable packet reception */
+ WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */
smp_wmb();
ip_vs_sync_net_cleanup(ipvs);
}
- LeaveFunction(2);
}
static struct pernet_operations ipvs_core_ops = {
@@ -2458,3 +2450,4 @@ static void __exit ip_vs_cleanup(void)
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Virtual Server");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 2a5ed71c82c3..068702894377 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -13,8 +13,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/init.h>
@@ -94,6 +93,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
int availmem;
+ int amemthresh;
int nomem;
int to_change = -1;
@@ -105,7 +105,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < ipvs->sysctl_amemthresh);
+ amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
+ nomem = (availmem < amemthresh);
local_bh_disable();
@@ -145,9 +146,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
break;
case 1:
if (nomem) {
- ipvs->drop_rate = ipvs->drop_counter
- = ipvs->sysctl_amemthresh /
- (ipvs->sysctl_amemthresh-availmem);
+ ipvs->drop_counter = amemthresh / (amemthresh - availmem);
+ ipvs->drop_rate = ipvs->drop_counter;
ipvs->sysctl_drop_packet = 2;
} else {
ipvs->drop_rate = 0;
@@ -155,9 +155,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
break;
case 2:
if (nomem) {
- ipvs->drop_rate = ipvs->drop_counter
- = ipvs->sysctl_amemthresh /
- (ipvs->sysctl_amemthresh-availmem);
+ ipvs->drop_counter = amemthresh / (amemthresh - availmem);
+ ipvs->drop_rate = ipvs->drop_counter;
} else {
ipvs->drop_rate = 0;
ipvs->sysctl_drop_packet = 1;
@@ -256,7 +255,7 @@ static void est_reload_work_handler(struct work_struct *work)
struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
/* netns clean up started, abort delayed work */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
goto unlock;
if (!kd)
continue;
@@ -848,7 +847,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
struct ip_vs_dest *dest, *nxt;
- del_timer_sync(&ipvs->dest_trash_timer);
+ timer_delete_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
list_del(&dest->t_list);
@@ -1061,8 +1060,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
unsigned int atype;
int ret;
- EnterFunction(2);
-
#ifdef CONFIG_IP_VS_IPV6
if (udest->af == AF_INET6) {
atype = ipv6_addr_type(&udest->addr.in6);
@@ -1111,7 +1108,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
spin_lock_init(&dest->dst_lock);
__ip_vs_update_dest(svc, dest, udest, 1);
- LeaveFunction(2);
return 0;
err_stats:
@@ -1134,8 +1130,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
__be16 dport = udest->port;
int ret;
- EnterFunction(2);
-
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
@@ -1183,7 +1177,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
if (ret < 0)
- goto err;
+ return ret;
__ip_vs_update_dest(svc, dest, udest, 1);
} else {
/*
@@ -1192,9 +1186,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ret = ip_vs_new_dest(svc, udest);
}
-err:
- LeaveFunction(2);
-
return ret;
}
@@ -1209,8 +1200,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
union nf_inet_addr daddr;
__be16 dport = udest->port;
- EnterFunction(2);
-
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
@@ -1242,7 +1231,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
}
__ip_vs_update_dest(svc, dest, udest, 0);
- LeaveFunction(2);
return 0;
}
@@ -1317,8 +1305,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
struct ip_vs_dest *dest;
__be16 dport = udest->port;
- EnterFunction(2);
-
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
@@ -1339,14 +1325,13 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
*/
__ip_vs_del_dest(svc->ipvs, dest, false);
- LeaveFunction(2);
-
return 0;
}
static void ip_vs_dest_trash_expire(struct timer_list *t)
{
- struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer);
+ struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
+ dest_trash_timer);
struct ip_vs_dest *dest, *next;
unsigned long now = jiffies;
@@ -1474,18 +1459,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
if (ret < 0)
goto out_err;
- /* Bind the ct retriever */
- RCU_INIT_POINTER(svc->pe, pe);
- pe = NULL;
-
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
- if (svc->pe && svc->pe->conn_out)
+ if (pe && pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
+ /* Bind the ct retriever */
+ RCU_INIT_POINTER(svc->pe, pe);
+ pe = NULL;
+
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
@@ -1497,9 +1482,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
*svc_p = svc;
- if (!ipvs->enable) {
+ if (!READ_ONCE(ipvs->enable)) {
/* Now there is a service - full throttle */
- ipvs->enable = 1;
+ WRITE_ONCE(ipvs->enable, 1);
/* Start estimation for first time */
ip_vs_est_reload_start(ipvs);
@@ -1746,7 +1731,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
struct netns_ipvs *ipvs;
struct net *net;
- EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
list_for_each_entry(net, net_list, exit_list) {
@@ -1754,7 +1738,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
ip_vs_flush(ipvs, true);
}
mutex_unlock(&__ip_vs_mutex);
- LeaveFunction(2);
}
/* Put all references for device (dst_cache) */
@@ -1792,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
- EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
@@ -1821,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
- LeaveFunction(2);
return NOTIFY_DONE;
}
@@ -1865,7 +1846,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
#ifdef CONFIG_SYSCTL
static int
-proc_do_defense_mode(struct ctl_table *table, int write,
+proc_do_defense_mode(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
@@ -1892,9 +1873,10 @@ proc_do_defense_mode(struct ctl_table *table, int write,
}
static int
-proc_do_sync_threshold(struct ctl_table *table, int write,
+proc_do_sync_threshold(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val[2];
int rc;
@@ -1904,6 +1886,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
.mode = table->mode,
};
+ mutex_lock(&ipvs->sync_mutex);
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write) {
@@ -1913,11 +1896,12 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
else
memcpy(valp, val, sizeof(val));
}
+ mutex_unlock(&ipvs->sync_mutex);
return rc;
}
static int
-proc_do_sync_ports(struct ctl_table *table, int write,
+proc_do_sync_ports(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
@@ -1940,7 +1924,8 @@ proc_do_sync_ports(struct ctl_table *table, int write,
return rc;
}
-static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer)
+static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
+ void *buffer)
{
struct netns_ipvs *ipvs = table->extra2;
cpumask_var_t *valp = table->data;
@@ -1978,8 +1963,8 @@ out:
return ret;
}
-static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
- size_t size)
+static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
+ void *buffer, size_t size)
{
struct netns_ipvs *ipvs = table->extra2;
cpumask_var_t *valp = table->data;
@@ -1999,7 +1984,7 @@ static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
return ret;
}
-static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
+static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2026,7 +2011,7 @@ static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
return ret;
}
-static int ipvs_proc_est_nice(struct ctl_table *table, int write,
+static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
@@ -2056,7 +2041,7 @@ static int ipvs_proc_est_nice(struct ctl_table *table, int write,
return ret;
}
-static int ipvs_proc_run_estimation(struct ctl_table *table, int write,
+static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
@@ -2279,7 +2264,6 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec,
},
#endif
- { }
};
#endif
@@ -3107,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_SERVICES:
{
struct ip_vs_get_services *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_services *)arg;
size = struct_size(get, entrytable, get->num_services);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -3148,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_DESTS:
{
struct ip_vs_get_dests *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_dests *)arg;
size = struct_size(get, entrytable, get->num_dests);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -3678,10 +3662,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
udest->port = nla_get_be16(nla_port);
- if (nla_addr_family)
- udest->af = nla_get_u16(nla_addr_family);
- else
- udest->af = 0;
+ udest->af = nla_get_u16_default(nla_addr_family, 0);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
@@ -4285,6 +4266,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
struct net *net = ipvs->net;
struct ctl_table *tbl;
int idx, ret;
+ size_t ctl_table_size = ARRAY_SIZE(vs_vars);
+ bool unpriv = net->user_ns != &init_user_ns;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
@@ -4299,10 +4282,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- tbl[0].procname = NULL;
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
@@ -4328,10 +4307,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_sync_ports = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ports;
tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+
ipvs->sysctl_sync_sock_size = 0;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
@@ -4340,6 +4326,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx].extra2 = ipvs;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
@@ -4353,15 +4340,22 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
+
ipvs->sysctl_run_estimation = 1;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx].extra2 = ipvs;
tbl[idx++].data = &ipvs->sysctl_run_estimation;
ipvs->est_cpulist_valid = 0;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx].extra2 = ipvs;
tbl[idx++].data = &ipvs->sysctl_est_cpulist;
ipvs->sysctl_est_nice = IPVS_EST_NICE;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx].extra2 = ipvs;
tbl[idx++].data = &ipvs->sysctl_est_nice;
@@ -4372,7 +4366,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
#endif
ret = -ENOMEM;
- ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
+ ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
+ ctl_table_size);
if (!ipvs->sysctl_hdr)
goto err;
ipvs->sysctl_tbl = tbl;
@@ -4537,8 +4532,6 @@ int __init ip_vs_control_init(void)
int idx;
int ret;
- EnterFunction(2);
-
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
@@ -4551,15 +4544,12 @@ int __init ip_vs_control_init(void)
if (ret < 0)
return ret;
- LeaveFunction(2);
return 0;
}
void ip_vs_control_cleanup(void)
{
- EnterFunction(2);
unregister_netdevice_notifier(&ip_vs_dst_notifier);
/* relying on common rcu_barrier() in ip_vs_cleanup() */
- LeaveFunction(2);
}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 5e6ec32aff2b..bb7aca4601ff 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -30,8 +30,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -270,3 +269,4 @@ static void __exit ip_vs_dh_cleanup(void)
module_init(ip_vs_dh_init);
module_exit(ip_vs_dh_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs destination hashing scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ce2a1549b304..77f4f637ff67 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -12,8 +12,7 @@
* get_stats()) do the per cpu summing.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -21,6 +20,7 @@
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
+#include <linux/rcupdate_wait.h>
#include <net/ip_vs.h>
@@ -230,7 +230,7 @@ static int ip_vs_estimation_kthread(void *data)
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
/* Ignore reloads before first service is added */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
return;
ip_vs_est_stopped_recalc(ipvs);
/* Bump the kthread configuration genid */
@@ -264,7 +264,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
}
set_user_nice(kd->task, sysctl_est_nice(ipvs));
- set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
+ if (sysctl_est_preferred_cpulist(ipvs))
+ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
pr_info("starting estimator thread %d...\n", kd->id);
wake_up_process(kd->task);
@@ -304,7 +305,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
int i;
if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
- ipvs->enable && ipvs->est_max_threads)
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
return -EINVAL;
mutex_lock(&ipvs->est_mutex);
@@ -341,7 +342,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
}
/* Start kthread tasks only when services are present */
- if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
+ if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
ret = ip_vs_est_kthread_start(ipvs, kd);
if (ret < 0)
goto out;
@@ -484,7 +485,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
struct ip_vs_estimator *est = &stats->est;
int ret;
- if (!ipvs->est_max_threads && ipvs->enable)
+ if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
est->ktid = -1;
@@ -549,7 +550,7 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
__set_bit(row, kd->avail);
if (!kd->tick_len[row]) {
RCU_INIT_POINTER(kd->ticks[row], NULL);
- kfree_rcu(td);
+ kfree_rcu(td, rcu_head);
}
kd->est_count--;
if (kd->est_count) {
@@ -661,7 +662,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
/* Wait for cpufreq frequency transition */
wait_event_idle_timeout(wq, kthread_should_stop(),
HZ / 50);
- if (!ipvs->enable || kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
goto stop;
}
@@ -679,7 +680,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
rcu_read_unlock();
local_bh_enable();
- if (!ipvs->enable || kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
goto stop;
cond_resched();
@@ -755,7 +756,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
mutex_lock(&ipvs->est_mutex);
for (id = 1; id < ipvs->est_kt_count; id++) {
/* netns clean up started, abort */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
goto unlock2;
kd = ipvs->est_kt_arr[id];
if (!kd)
@@ -785,7 +786,7 @@ last_kt:
id = ipvs->est_kt_count;
next_kt:
- if (!ipvs->enable || kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
goto unlock;
id--;
if (id < 0)
diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c
index b846cc385279..d657b47c6511 100644
--- a/net/netfilter/ipvs/ip_vs_fo.c
+++ b/net/netfilter/ipvs/ip_vs_fo.c
@@ -8,8 +8,7 @@
* Kenny Mathis : added initial functionality based on weight
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -72,3 +71,4 @@ static void __exit ip_vs_fo_cleanup(void)
module_init(ip_vs_fo_init);
module_exit(ip_vs_fo_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted failover scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index ef1f45e43b63..b315c608fda4 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -16,8 +16,7 @@
* Author: Wouter Gadeyne
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/moduleparam.h>
@@ -35,7 +34,7 @@
#include <linux/gfp.h>
#include <net/protocol.h>
#include <net/tcp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/ip_vs.h>
@@ -53,6 +52,7 @@ enum {
IP_VS_FTP_EPSV,
};
+static bool exiting_module;
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
@@ -605,7 +605,7 @@ static void __ip_vs_ftp_exit(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- if (!ipvs)
+ if (!ipvs || !exiting_module)
return;
unregister_ip_vs_app(ipvs, &ip_vs_ftp);
@@ -627,6 +627,7 @@ static int __init ip_vs_ftp_init(void)
*/
static void __exit ip_vs_ftp_exit(void)
{
+ exiting_module = true;
unregister_pernet_subsys(&ip_vs_ftp_ops);
/* rcu_barrier() is called by netns */
}
@@ -635,3 +636,4 @@ static void __exit ip_vs_ftp_exit(void)
module_init(ip_vs_ftp_init);
module_exit(ip_vs_ftp_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs ftp helper");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 1b87214d385e..e6c8ed0c92f6 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -34,8 +34,7 @@
* me to write this module.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -123,7 +122,6 @@ static struct ctl_table vs_vars_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
#endif
@@ -293,7 +291,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
*/
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
- struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
+ struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t,
+ periodic_timer);
struct ip_vs_service *svc = tbl->svc;
unsigned long now = jiffies;
int goal;
@@ -550,6 +549,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
static int __net_init __ip_vs_lblc_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ size_t vars_table_size = ARRAY_SIZE(vs_vars_table);
if (!ipvs)
return -ENOENT;
@@ -563,15 +563,16 @@ static int __net_init __ip_vs_lblc_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- ipvs->lblc_ctl_table[0].procname = NULL;
+ vars_table_size = 0;
} else
ipvs->lblc_ctl_table = vs_vars_table;
ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
- ipvs->lblc_ctl_header =
- register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
+ ipvs->lblc_ctl_table,
+ vars_table_size);
if (!ipvs->lblc_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblc_ctl_table);
@@ -628,3 +629,4 @@ static void __exit ip_vs_lblc_cleanup(void)
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index ad8f5fea6d3a..a25cf7bb6185 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -32,8 +32,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/module.h>
@@ -294,7 +293,6 @@ static struct ctl_table vs_vars_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
#endif
@@ -457,7 +455,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
*/
static void ip_vs_lblcr_check_expire(struct timer_list *t)
{
- struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer);
+ struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t,
+ periodic_timer);
struct ip_vs_service *svc = tbl->svc;
unsigned long now = jiffies;
int goal;
@@ -736,6 +735,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
static int __net_init __ip_vs_lblcr_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ size_t vars_table_size = ARRAY_SIZE(vs_vars_table);
if (!ipvs)
return -ENOENT;
@@ -749,14 +749,15 @@ static int __net_init __ip_vs_lblcr_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- ipvs->lblcr_ctl_table[0].procname = NULL;
+ vars_table_size = 0;
} else
ipvs->lblcr_ctl_table = vs_vars_table;
ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
- ipvs->lblcr_ctl_header =
- register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
+ ipvs->lblcr_ctl_table,
+ vars_table_size);
if (!ipvs->lblcr_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblcr_ctl_table);
@@ -813,3 +814,4 @@ static void __exit ip_vs_lblcr_cleanup(void)
module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 9d34d81fc6f1..38cc38c5d8bb 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -9,8 +9,7 @@
* Wensong Zhang : added any dest with weight=0 is quiesced
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -86,3 +85,4 @@ static void __exit ip_vs_lc_cleanup(void)
module_init(ip_vs_lc_init);
module_exit(ip_vs_lc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs least connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index e3d7f5c879ce..f61f54004c9e 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 08adcb222986..81974f69e5bb 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -30,8 +30,7 @@
* PASV response can not be NAT-ed) but Active FTP should work
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/types.h>
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index f56862a87518..ada158c610ce 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -26,8 +26,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -136,3 +135,4 @@ static void __exit ip_vs_nq_cleanup(void)
module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs never queue scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c
index c03066fdd5ca..c5c67df80a0b 100644
--- a/net/netfilter/ipvs/ip_vs_ovf.c
+++ b/net/netfilter/ipvs/ip_vs_ovf.c
@@ -12,8 +12,7 @@
* active connections
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -79,3 +78,4 @@ static void __exit ip_vs_ovf_cleanup(void)
module_init(ip_vs_ovf_init);
module_exit(ip_vs_ovf_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs overflow connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 166c669f0763..3035079ebd99 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 0ac6705a61d3..85f31d71e29a 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -185,3 +184,4 @@ static void __exit ip_vs_sip_cleanup(void)
module_init(ip_vs_sip_init);
module_exit(ip_vs_sip_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs sip helper");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index f100da4ba3bc..fd9dbca24c85 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -8,8 +8,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -340,7 +339,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs)
int __init ip_vs_protocol_init(void)
{
- char protocols[64];
+ char protocols[64] = { 0 };
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
@@ -348,8 +347,6 @@ int __init ip_vs_protocol_init(void)
strcat(protocols, (p)->name); \
} while (0)
- protocols[0] = '\0';
- protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 89602c16f6b6..44e14acc187e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -6,8 +6,7 @@
* Wensong Zhang <wensong@linuxvirtualserver.org>
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/in.h>
#include <linux/ip.h>
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index a0921adc31a9..83e452916403 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (sctph->source != cp->vport || payload_csum ||
skb->ip_summed == CHECKSUM_PARTIAL) {
sctph->source = cp->vport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ if (!skb_is_gso(skb))
+ sctp_nat_csum(skb, sctph, sctphoff);
} else {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
@@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
(skb->ip_summed == CHECKSUM_PARTIAL &&
!(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) {
sctph->dest = cp->dport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ if (!skb_is_gso(skb))
+ sctp_nat_csum(skb, sctph, sctphoff);
} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 7da51390cea6..f68a1533ee45 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -13,8 +13,7 @@
* protocol ip_vs_proto_data and is handled by netns
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/ip.h>
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 68260d91c988..0f0107c80dd2 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,8 +9,7 @@
* Network name space (netns) aware.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/in.h>
#include <linux/ip.h>
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index 38495c6f6c7c..4125ee561cdc 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -14,8 +14,7 @@
* Wensong Zhang : added any dest with weight=0 is quiesced
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -122,4 +121,5 @@ static void __exit ip_vs_rr_cleanup(void)
module_init(ip_vs_rr_init);
module_exit(ip_vs_rr_cleanup);
+MODULE_DESCRIPTION("ipvs round-robin scheduler");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index d4903723be7e..c6e421c4e299 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -12,8 +12,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 7663288e5358..245a323c84cd 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -30,8 +30,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -137,3 +136,4 @@ static void __exit ip_vs_sed_cleanup(void)
module_init(ip_vs_sed_init);
module_exit(ip_vs_sed_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs shortest expected delay scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index c2028e412092..0e85e07e23b9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -32,8 +32,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -376,3 +375,4 @@ static void __exit ip_vs_sh_cleanup(void)
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs source hashing scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 4963fec815da..54dd1514ac45 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -32,8 +32,7 @@
* Persistence support, fwmark and time-out.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/slab.h>
@@ -51,7 +50,7 @@
#include <linux/kernel.h>
#include <linux/sched/signal.h>
-#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+#include <linux/unaligned.h> /* Used for ntoh_seq and hton_seq */
#include <net/ip.h>
#include <net/sock.h>
@@ -603,7 +602,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
struct ip_vs_sync_conn_options *opt =
(struct ip_vs_sync_conn_options *)&s[1];
- memcpy(opt, &cp->in_seq, sizeof(*opt));
+ memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
}
m->nr_conns++;
@@ -1297,20 +1296,14 @@ static void set_sock_size(struct sock *sk, int mode, int val)
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
{
- struct inet_sock *inet = inet_sk(sk);
-
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
- lock_sock(sk);
- inet->mc_loop = loop ? 1 : 0;
+ inet_assign_bit(MC_LOOP, sk, loop);
#ifdef CONFIG_IP_VS_IPV6
- if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
+ if (READ_ONCE(sk->sk_family) == AF_INET6) {
/* IPV6_MULTICAST_LOOP */
- np->mc_loop = loop ? 1 : 0;
+ inet6_assign_bit(MC6_LOOP, sk, loop);
}
#endif
- release_sock(sk);
}
/*
@@ -1322,13 +1315,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
- inet->mc_ttl = ttl;
+ WRITE_ONCE(inet->mc_ttl, ttl);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_HOPS */
- np->mcast_hops = ttl;
+ WRITE_ONCE(np->mcast_hops, ttl);
}
#endif
release_sock(sk);
@@ -1341,13 +1334,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
lock_sock(sk);
- inet->pmtudisc = val;
+ WRITE_ONCE(inet->pmtudisc, val);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MTU_DISCOVER */
- np->pmtudisc = val;
+ WRITE_ONCE(np->pmtudisc, val);
}
#endif
release_sock(sk);
@@ -1371,7 +1364,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev)
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_IF */
- np->mcast_oif = dev->ifindex;
+ WRITE_ONCE(np->mcast_oif, dev->ifindex);
}
#endif
release_sock(sk);
@@ -1441,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
sin.sin_addr.s_addr = addr;
sin.sin_port = 0;
- return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+ return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
}
static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
@@ -1507,8 +1500,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id,
}
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
- result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
- salen, 0);
+ result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr,
+ salen, 0);
if (result < 0) {
pr_err("Error connecting to the multicast addr\n");
goto error;
@@ -1548,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id,
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
sock->sk->sk_bound_dev_if = dev->ifindex;
- result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
+ result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
goto error;
@@ -1582,13 +1575,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
struct kvec iov;
int len;
- EnterFunction(7);
iov.iov_base = (void *)buffer;
iov.iov_len = length;
len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
- LeaveFunction(7);
return len;
}
@@ -1614,15 +1605,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
struct kvec iov = {buffer, buflen};
int len;
- EnterFunction(7);
-
/* Receive a packet */
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen);
len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
if (len < 0)
return len;
- LeaveFunction(7);
return len;
}
diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c
index 3308e4cc740a..dbb7f5fd4688 100644
--- a/net/netfilter/ipvs/ip_vs_twos.c
+++ b/net/netfilter/ipvs/ip_vs_twos.c
@@ -4,8 +4,7 @@
* Authors: Darby Payne <darby.payne@applovin.com>
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/module.h>
@@ -137,3 +136,4 @@ static void __exit ip_vs_twos_cleanup(void)
module_init(ip_vs_twos_init);
module_exit(ip_vs_twos_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs power of twos choice scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index 09f584b564a0..9da445ca09a1 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -14,8 +14,7 @@
* Wensong Zhang : added any dest with weight=0 is quiesced
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -109,3 +108,4 @@ static void __exit ip_vs_wlc_cleanup(void)
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted least connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 1bc7a0789d85..99f09cbf2d9b 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -13,8 +13,7 @@
* with weight 0 when all weights are zero
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -263,3 +262,4 @@ static void __exit ip_vs_wrr_cleanup(void)
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted round-robin scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 029171379884..3162ce3c2640 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -21,8 +21,7 @@
* - the only place where we can see skb->sk != NULL
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -97,7 +96,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
if (!dest_dst)
return NULL;
dst = dest_dst->dst_cache;
- if (dst->obsolete &&
+ if (READ_ONCE(dst->obsolete) &&
dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
return dest_dst;
@@ -119,13 +118,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
return false;
}
-/* Get route to daddr, update *saddr, optionally bind route to saddr */
+/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
- int rt_mode, __be32 *saddr)
+ int rt_mode, __be32 *ret_saddr)
{
struct flowi4 fl4;
struct rtable *rt;
- bool loop = false;
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
@@ -135,23 +133,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
retry:
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) {
- /* Invalid saddr ? */
- if (PTR_ERR(rt) == -EINVAL && *saddr &&
- rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
- *saddr = 0;
- flowi4_update_output(&fl4, 0, 0, daddr, 0);
- goto retry;
- }
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
return NULL;
- } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ }
+ if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
ip_rt_put(rt);
- *saddr = fl4.saddr;
- flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
- loop = true;
+ flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
+ rt_mode = 0;
goto retry;
}
- *saddr = fl4.saddr;
+ if (ret_saddr)
+ *ret_saddr = fl4.saddr;
return rt;
}
@@ -180,7 +172,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
(addr_type & IPV6_ADDR_LOOPBACK);
old_rt_is_local = __ip_vs_is_local_route6(
- (struct rt6_info *)skb_dst(skb));
+ dst_rt6_info(skb_dst(skb)));
} else
#endif
{
@@ -271,7 +263,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}
@@ -286,7 +278,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
{
if (ip_hdr(skb)->ttl <= 1) {
/* Tell the sender its packet died... */
- __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
return false;
}
@@ -318,7 +310,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
if (likely(dest_dst))
- rt = (struct rtable *) dest_dst->dst_cache;
+ rt = dst_rtable(dest_dst->dst_cache);
else {
dest_dst = ip_vs_dest_dst_alloc();
spin_lock_bh(&dest->dst_lock);
@@ -339,24 +331,20 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
&dest->addr.ip, &dest_dst->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__rcuref));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.ip;
} else {
- __be32 saddr = htonl(INADDR_ANY);
-
noref = 0;
/* For such unconfigured boxes avoid many route lookups
* for performance reasons because we do not remember saddr
*/
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
- rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
if (!rt)
goto err_unreach;
- if (ret_saddr)
- *ret_saddr = saddr;
}
local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
@@ -390,10 +378,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
skb->ip_summed == CHECKSUM_PARTIAL)
mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
- __be16 tflags = 0;
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
mtu -= gre_calc_hlen(tflags);
}
if (mtu < 68) {
@@ -481,7 +469,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
if (likely(dest_dst))
- rt = (struct rt6_info *) dest_dst->dst_cache;
+ rt = dst_rt6_info(dest_dst->dst_cache);
else {
u32 cookie;
@@ -501,13 +489,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
ip_vs_dest_dst_free(dest_dst);
goto err_unreach;
}
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
cookie = rt6_get_cookie(rt);
__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
&dest->addr.in6, &dest_dst->dst_saddr.in6,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__rcuref));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.in6;
@@ -517,7 +505,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
rt_mode);
if (!dst)
goto err_unreach;
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
}
local = __ip_vs_is_local_route6(rt);
@@ -553,10 +541,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
skb->ip_summed == CHECKSUM_PARTIAL)
mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
- __be16 tflags = 0;
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
mtu -= gre_calc_hlen(tflags);
}
if (mtu < IPV6_MIN_MTU) {
@@ -706,8 +694,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct iphdr *iph = ip_hdr(skb);
- EnterFunction(10);
-
if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
goto tx_error;
@@ -719,12 +705,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -735,8 +719,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- EnterFunction(10);
-
if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
&iph->daddr, NULL,
ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
@@ -747,12 +729,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
#endif
@@ -768,8 +748,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;
- EnterFunction(10);
-
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
@@ -839,12 +817,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
- LeaveFunction(10);
return rc;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -856,8 +832,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
int local, rc;
- EnterFunction(10);
-
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
@@ -876,7 +850,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_RDR);
if (local < 0)
goto tx_error;
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -927,11 +901,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
- LeaveFunction(10);
return rc;
tx_error:
- LeaveFunction(10);
kfree_skb(skb);
return NF_STOLEN;
}
@@ -994,7 +966,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
old_dsfield = ipv4_get_dsfield(old_iph);
*ttl = old_iph->ttl;
if (payload_len)
- *payload_len = ntohs(old_iph->tot_len);
+ *payload_len = skb_ip_totlen(skb);
}
/* Implement full-functionality option for ECN encapsulation */
@@ -1098,11 +1070,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb,
{
__be16 proto = *next_protocol == IPPROTO_IPIP ?
htons(ETH_P_IP) : htons(ETH_P_IPV6);
- __be16 tflags = 0;
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
size_t hdrlen;
if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
hdrlen = gre_calc_hlen(tflags);
gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
@@ -1149,8 +1121,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
int tun_type, gso_type;
int tun_flags;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1183,11 +1153,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom += sizeof(struct udphdr) + gue_hdrlen;
} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
size_t gre_hdrlen;
- __be16 tflags = 0;
if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
gre_hdrlen = gre_calc_hlen(tflags);
max_headroom += gre_hdrlen;
@@ -1199,7 +1169,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
&next_protocol, NULL, &dsfield,
&ttl, dfp);
if (IS_ERR(skb))
- goto tx_error;
+ return NF_STOLEN;
gso_type = __tun_gso_type_mask(AF_INET, cp->af);
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1225,6 +1195,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->transport_header = skb->network_header;
skb_set_inner_ipproto(skb, next_protocol);
+ skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
bool check = false;
@@ -1267,14 +1238,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
else if (ret == NF_DROP)
kfree_skb(skb);
- LeaveFunction(10);
-
return NF_STOLEN;
tx_error:
- if (!IS_ERR(skb))
- kfree_skb(skb);
- LeaveFunction(10);
+ kfree_skb(skb);
return NF_STOLEN;
}
@@ -1298,8 +1265,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
int tun_type, gso_type;
int tun_flags;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
&saddr, ipvsh, 1,
@@ -1311,7 +1276,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (local)
return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
tdev = rt->dst.dev;
/*
@@ -1333,11 +1298,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom += sizeof(struct udphdr) + gue_hdrlen;
} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
size_t gre_hdrlen;
- __be16 tflags = 0;
if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
gre_hdrlen = gre_calc_hlen(tflags);
max_headroom += gre_hdrlen;
@@ -1347,7 +1312,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
&next_protocol, &payload_len,
&dsfield, &ttl, NULL);
if (IS_ERR(skb))
- goto tx_error;
+ return NF_STOLEN;
gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1373,6 +1338,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->transport_header = skb->network_header;
skb_set_inner_ipproto(skb, next_protocol);
+ skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
bool check = false;
@@ -1414,14 +1380,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
else if (ret == NF_DROP)
kfree_skb(skb);
- LeaveFunction(10);
-
return NF_STOLEN;
tx_error:
- if (!IS_ERR(skb))
- kfree_skb(skb);
- LeaveFunction(10);
+ kfree_skb(skb);
return NF_STOLEN;
}
#endif
@@ -1437,8 +1399,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
int local;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1455,12 +1415,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -1471,8 +1429,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
int local;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
NULL, ipvsh, 0,
@@ -1489,12 +1445,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
#endif
@@ -1514,8 +1468,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
int local;
int rt_mode, was_input;
- EnterFunction(10);
-
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
@@ -1526,7 +1478,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
- goto out;
+ return rc;
}
/*
@@ -1582,14 +1534,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
- rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
- goto out;
+ return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
tx_error:
kfree_skb(skb);
rc = NF_STOLEN;
- out:
- LeaveFunction(10);
return rc;
}
@@ -1604,8 +1553,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
int local;
int rt_mode;
- EnterFunction(10);
-
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
@@ -1616,7 +1563,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
- goto out;
+ return rc;
}
/*
@@ -1631,7 +1578,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
&cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
if (local < 0)
goto tx_error;
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -1671,14 +1618,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
- rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
- goto out;
+ return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
tx_error:
kfree_skb(skb);
rc = NF_STOLEN;
-out:
- LeaveFunction(10);
return rc;
}
#endif
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
new file mode 100644
index 000000000000..46e667a50d98
--- /dev/null
+++ b/net/netfilter/nf_bpf_link.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <uapi/linux/netfilter_ipv4.h>
+
+static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
+ const struct nf_hook_state *s)
+{
+ const struct bpf_prog *prog = bpf_prog;
+ struct bpf_nf_ctx ctx = {
+ .state = s,
+ .skb = skb,
+ };
+
+ return bpf_prog_run_pin_on_cpu(prog, &ctx);
+}
+
+struct bpf_nf_link {
+ struct bpf_link link;
+ struct nf_hook_ops hook_ops;
+ netns_tracker ns_tracker;
+ struct net *net;
+ u32 dead;
+ const struct nf_defrag_hook *defrag_hook;
+};
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+static const struct nf_defrag_hook *
+get_proto_defrag_hook(struct bpf_nf_link *link,
+ const struct nf_defrag_hook __rcu **ptr_global_hook,
+ const char *mod)
+{
+ const struct nf_defrag_hook *hook;
+ int err;
+
+ /* RCU protects us from races against module unloading */
+ rcu_read_lock();
+ hook = rcu_dereference(*ptr_global_hook);
+ if (!hook) {
+ rcu_read_unlock();
+ err = request_module("%s", mod);
+ if (err)
+ return ERR_PTR(err < 0 ? err : -EINVAL);
+
+ rcu_read_lock();
+ hook = rcu_dereference(*ptr_global_hook);
+ }
+
+ if (hook && try_module_get(hook->owner)) {
+ /* Once we have a refcnt on the module, we no longer need RCU */
+ hook = rcu_pointer_handoff(hook);
+ } else {
+ WARN_ONCE(!hook, "%s has bad registration", mod);
+ hook = ERR_PTR(-ENOENT);
+ }
+ rcu_read_unlock();
+
+ if (!IS_ERR(hook)) {
+ err = hook->enable(link->net);
+ if (err) {
+ module_put(hook->owner);
+ hook = ERR_PTR(err);
+ }
+ }
+
+ return hook;
+}
+#endif
+
+static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook __maybe_unused *hook;
+
+ switch (link->hook_ops.pf) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+ case NFPROTO_IPV4:
+ hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ case NFPROTO_IPV6:
+ hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+ default:
+ return -EAFNOSUPPORT;
+ }
+}
+
+static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook *hook = link->defrag_hook;
+
+ if (!hook)
+ return;
+ hook->disable(link->net);
+ module_put(hook->owner);
+}
+
+static void bpf_nf_link_release(struct bpf_link *link)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ if (nf_link->dead)
+ return;
+
+ /* do not double release in case .detach was already called */
+ if (!cmpxchg(&nf_link->dead, 0, 1)) {
+ nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+ bpf_nf_disable_defrag(nf_link);
+ put_net_track(nf_link->net, &nf_link->ns_tracker);
+ }
+}
+
+static void bpf_nf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ kfree(nf_link);
+}
+
+static int bpf_nf_link_detach(struct bpf_link *link)
+{
+ bpf_nf_link_release(link);
+ return 0;
+}
+
+static void bpf_nf_link_show_info(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
+ nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
+ nf_link->hook_ops.priority);
+}
+
+static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+ const struct nf_defrag_hook *hook = nf_link->defrag_hook;
+
+ info->netfilter.pf = nf_link->hook_ops.pf;
+ info->netfilter.hooknum = nf_link->hook_ops.hooknum;
+ info->netfilter.priority = nf_link->hook_ops.priority;
+ info->netfilter.flags = hook ? BPF_F_NETFILTER_IP_DEFRAG : 0;
+
+ return 0;
+}
+
+static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct bpf_link_ops bpf_nf_link_lops = {
+ .release = bpf_nf_link_release,
+ .dealloc = bpf_nf_link_dealloc,
+ .detach = bpf_nf_link_detach,
+ .show_fdinfo = bpf_nf_link_show_info,
+ .fill_link_info = bpf_nf_link_fill_link_info,
+ .update_prog = bpf_nf_link_update,
+};
+
+static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
+{
+ int prio;
+
+ switch (attr->link_create.netfilter.pf) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS)
+ return -EPROTO;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG)
+ return -EOPNOTSUPP;
+
+ /* make sure conntrack confirm is always last */
+ prio = attr->link_create.netfilter.priority;
+ if (prio == NF_IP_PRI_FIRST)
+ return -ERANGE; /* sabotage_in and other warts */
+ else if (prio == NF_IP_PRI_LAST)
+ return -ERANGE; /* e.g. conntrack confirm */
+ else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
+ prio <= NF_IP_PRI_CONNTRACK_DEFRAG)
+ return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */
+
+ return 0;
+}
+
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct bpf_nf_link *link;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ err = bpf_nf_check_pf_and_hooks(attr);
+ if (err)
+ return err;
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog,
+ attr->link_create.attach_type);
+
+ link->hook_ops.hook = nf_hook_run_bpf;
+ link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
+ link->hook_ops.priv = prog;
+
+ link->hook_ops.pf = attr->link_create.netfilter.pf;
+ link->hook_ops.priority = attr->link_create.netfilter.priority;
+ link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
+
+ link->net = net;
+ link->dead = false;
+ link->defrag_hook = NULL;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) {
+ err = bpf_nf_enable_defrag(link);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
+ err = nf_register_net_hook(net, &link->hook_ops);
+ if (err) {
+ bpf_nf_disable_defrag(link);
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ get_net_track(net, &link->ns_tracker, GFP_KERNEL);
+
+ return bpf_link_settle(&link_primer);
+}
+
+const struct bpf_prog_ops netfilter_prog_ops = {
+ .test_run = bpf_prog_test_run_nf,
+};
+
+static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
+{
+ struct btf *btf;
+ s32 type_id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR_OR_NULL(btf))
+ return false;
+
+ type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+ if (WARN_ON_ONCE(type_id < 0))
+ return false;
+
+ info->btf = btf;
+ info->btf_id = type_id;
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ return true;
+}
+
+static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_nf_ctx))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_nf_ctx, skb):
+ if (size != sizeof_field(struct bpf_nf_ctx, skb))
+ return false;
+
+ return nf_ptr_to_btf_id(info, "sk_buff");
+ case bpf_ctx_range(struct bpf_nf_ctx, state):
+ if (size != sizeof_field(struct bpf_nf_ctx, state))
+ return false;
+
+ return nf_ptr_to_btf_id(info, "nf_hook_state");
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+static const struct bpf_func_proto *
+bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+const struct bpf_verifier_ops netfilter_verifier_ops = {
+ .is_valid_access = nf_is_valid_access,
+ .get_func_proto = bpf_nf_func_proto,
+};
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 5d8ed6c90b7e..f1be4dd5cf85 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -122,17 +122,67 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
return ERR_PTR(-EAGAIN);
}
+static bool get_ct_or_tuple_from_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conn **ct,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone **zone,
+ bool *refcounted)
+{
+ const struct nf_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *found_ct;
+
+ found_ct = nf_ct_get(skb, &ctinfo);
+ if (found_ct && !nf_ct_is_template(found_ct)) {
+ *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ *zone = nf_ct_zone(found_ct);
+ *ct = found_ct;
+ return true;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple))
+ return false;
+
+ if (found_ct)
+ *zone = nf_ct_zone(found_ct);
+
+ h = nf_conntrack_find_get(net, *zone, tuple);
+ if (!h)
+ return true;
+
+ found_ct = nf_ct_tuplehash_to_ctrack(h);
+ *refcounted = true;
+ *ct = found_ct;
+
+ return true;
+}
+
static int __nf_conncount_add(struct net *net,
- struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_list *list)
{
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct = NULL;
struct nf_conn *found_ct;
unsigned int collect = 0;
+ bool refcounted = false;
+
+ if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted))
+ return -ENOENT;
- if (time_is_after_eq_jiffies((unsigned long)list->last_gc))
+ if (ct && nf_ct_is_confirmed(ct)) {
+ if (refcounted)
+ nf_ct_put(ct);
+ return -EEXIST;
+ }
+
+ if ((u32)jiffies == list->last_gc)
goto add_new_node;
/* check the saved connections */
@@ -144,10 +194,10 @@ static int __nf_conncount_add(struct net *net,
if (IS_ERR(found)) {
/* Not found, but might be about to be confirmed */
if (PTR_ERR(found) == -EAGAIN) {
- if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+ if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
nf_ct_zone_id(zone, zone->dir))
- return 0; /* already exists */
+ goto out_put; /* already exists */
} else {
collect++;
}
@@ -156,7 +206,7 @@ static int __nf_conncount_add(struct net *net,
found_ct = nf_ct_tuplehash_to_ctrack(found);
- if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+ if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
nf_ct_zone_equal(found_ct, zone, zone->dir)) {
/*
* We should not see tuples twice unless someone hooks
@@ -165,7 +215,7 @@ static int __nf_conncount_add(struct net *net,
* Attempt to avoid a re-add in this case.
*/
nf_ct_put(found_ct);
- return 0;
+ goto out_put;
} else if (already_closed(found_ct)) {
/*
* we do not care about connections which are
@@ -188,31 +238,35 @@ add_new_node:
if (conn == NULL)
return -ENOMEM;
- conn->tuple = *tuple;
+ conn->tuple = tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
list_add_tail(&conn->node, &list->head);
list->count++;
list->last_gc = (u32)jiffies;
+
+out_put:
+ if (refcounted)
+ nf_ct_put(ct);
return 0;
}
-int nf_conncount_add(struct net *net,
- struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+int nf_conncount_add_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_list *list)
{
int ret;
/* check the saved connections */
spin_lock_bh(&list->list_lock);
- ret = __nf_conncount_add(net, list, tuple, zone);
+ ret = __nf_conncount_add(net, skb, l3num, list);
spin_unlock_bh(&list->list_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_conncount_add);
+EXPORT_SYMBOL_GPL(nf_conncount_add_skb);
void nf_conncount_list_init(struct nf_conncount_list *list)
{
@@ -224,8 +278,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
/* Return true if the list is empty. Must be called with BH disabled. */
-bool nf_conncount_gc_list(struct net *net,
- struct nf_conncount_list *list)
+static bool __nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
{
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
@@ -234,11 +288,7 @@ bool nf_conncount_gc_list(struct net *net,
bool ret = false;
/* don't bother if we just did GC */
- if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc)))
- return false;
-
- /* don't bother if other cpu is already doing GC */
- if (!spin_trylock(&list->list_lock))
+ if ((u32)jiffies == READ_ONCE(list->last_gc))
return false;
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
@@ -269,7 +319,21 @@ bool nf_conncount_gc_list(struct net *net,
if (!list->count)
ret = true;
list->last_gc = (u32)jiffies;
- spin_unlock(&list->list_lock);
+
+ return ret;
+}
+
+bool nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
+{
+ bool ret;
+
+ /* don't bother if other cpu is already doing GC */
+ if (!spin_trylock_bh(&list->list_lock))
+ return false;
+
+ ret = __nf_conncount_gc_list(net, list);
+ spin_unlock_bh(&list->list_lock);
return ret;
}
@@ -309,20 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
static unsigned int
insert_tree(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
struct nf_conncount_data *data,
struct rb_root *root,
unsigned int hash,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const u32 *key)
{
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+ bool do_gc = true, refcounted = false;
+ unsigned int count = 0, gc_count = 0;
struct rb_node **rbnode, *parent;
- struct nf_conncount_rb *rbconn;
+ struct nf_conntrack_tuple tuple;
struct nf_conncount_tuple *conn;
- unsigned int count = 0, gc_count = 0;
- u8 keylen = data->keylen;
- bool do_gc = true;
+ struct nf_conncount_rb *rbconn;
+ struct nf_conn *ct = NULL;
spin_lock_bh(&nf_conncount_locks[hash]);
restart:
@@ -333,7 +399,7 @@ restart:
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
parent = *rbnode;
- diff = key_diff(key, rbconn->key, keylen);
+ diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
rbnode = &((*rbnode)->rb_left);
} else if (diff > 0) {
@@ -341,8 +407,8 @@ restart:
} else {
int ret;
- ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
- if (ret)
+ ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list);
+ if (ret && ret != -EEXIST)
count = 0; /* hotdrop */
else
count = rbconn->list.count;
@@ -365,28 +431,35 @@ restart:
goto restart;
}
- /* expected case: match, insert new node */
- rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
- if (rbconn == NULL)
- goto out_unlock;
+ if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) {
+ /* expected case: match, insert new node */
+ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+ if (rbconn == NULL)
+ goto out_unlock;
- conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
- if (conn == NULL) {
- kmem_cache_free(conncount_rb_cachep, rbconn);
- goto out_unlock;
- }
+ conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL) {
+ kmem_cache_free(conncount_rb_cachep, rbconn);
+ goto out_unlock;
+ }
- conn->tuple = *tuple;
- conn->zone = *zone;
- memcpy(rbconn->key, key, sizeof(u32) * keylen);
+ conn->tuple = tuple;
+ conn->zone = *zone;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
+ memcpy(rbconn->key, key, sizeof(u32) * data->keylen);
- nf_conncount_list_init(&rbconn->list);
- list_add(&conn->node, &rbconn->list.head);
- count = 1;
- rbconn->list.count = count;
+ nf_conncount_list_init(&rbconn->list);
+ list_add(&conn->node, &rbconn->list.head);
+ count = 1;
+ rbconn->list.count = count;
- rb_link_node_rcu(&rbconn->node, parent, rbnode);
- rb_insert_color(&rbconn->node, root);
+ rb_link_node_rcu(&rbconn->node, parent, rbnode);
+ rb_insert_color(&rbconn->node, root);
+
+ if (refcounted)
+ nf_ct_put(ct);
+ }
out_unlock:
spin_unlock_bh(&nf_conncount_locks[hash]);
return count;
@@ -394,16 +467,15 @@ out_unlock:
static unsigned int
count_tree(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const u32 *key)
{
struct rb_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
- u8 keylen = data->keylen;
hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
root = &data->root[hash];
@@ -414,7 +486,7 @@ count_tree(struct net *net,
rbconn = rb_entry(parent, struct nf_conncount_rb, node);
- diff = key_diff(key, rbconn->key, keylen);
+ diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
parent = rcu_dereference_raw(parent->rb_left);
} else if (diff > 0) {
@@ -422,7 +494,7 @@ count_tree(struct net *net,
} else {
int ret;
- if (!tuple) {
+ if (!skb) {
nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
}
@@ -437,19 +509,23 @@ count_tree(struct net *net,
}
/* same source network -> be counted! */
- ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
+ ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
spin_unlock_bh(&rbconn->list.list_lock);
- if (ret)
+ if (ret && ret != -EEXIST) {
return 0; /* hotdrop */
- else
+ } else {
+ /* -EEXIST means add was skipped, update the list */
+ if (ret == -EEXIST)
+ nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
+ }
}
}
- if (!tuple)
+ if (!skb)
return 0;
- return insert_tree(net, data, root, hash, key, tuple, zone);
+ return insert_tree(net, skb, l3num, data, root, hash, key);
}
static void tree_gc_worker(struct work_struct *work)
@@ -511,24 +587,24 @@ next:
}
/* Count and return number of conntrack entries in 'net' with particular 'key'.
- * If 'tuple' is not null, insert it into the accounting data structure.
- * Call with RCU read lock.
+ * If 'skb' is not null, insert the corresponding tuple into the accounting
+ * data structure. Call with RCU read lock.
*/
-unsigned int nf_conncount_count(struct net *net,
- struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+unsigned int nf_conncount_count_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_data *data,
+ const u32 *key)
{
- return count_tree(net, data, key, tuple, zone);
+ return count_tree(net, skb, l3num, data, key);
+
}
-EXPORT_SYMBOL_GPL(nf_conncount_count);
+EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
-struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
- unsigned int keylen)
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
struct nf_conncount_data *data;
- int ret, i;
+ int i;
if (keylen % sizeof(u32) ||
keylen / sizeof(u32) > MAX_KEYLEN ||
@@ -541,12 +617,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
if (!data)
return ERR_PTR(-ENOMEM);
- ret = nf_ct_netns_get(net, family);
- if (ret < 0) {
- kfree(data);
- return ERR_PTR(ret);
- }
-
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
data->root[i] = RB_ROOT;
@@ -583,13 +653,11 @@ static void destroy_tree(struct rb_root *r)
}
}
-void nf_conncount_destroy(struct net *net, unsigned int family,
- struct nf_conncount_data *data)
+void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
{
unsigned int i;
cancel_work_sync(&data->gc_work);
- nf_ct_netns_put(net, family);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
@@ -605,15 +673,11 @@ static int __init nf_conncount_modinit(void)
for (i = 0; i < CONNCOUNT_SLOTS; ++i)
spin_lock_init(&nf_conncount_locks[i]);
- conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
- sizeof(struct nf_conncount_tuple),
- 0, 0, NULL);
+ conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
if (!conncount_conn_cachep)
return -ENOMEM;
- conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
- sizeof(struct nf_conncount_rb),
- 0, 0, NULL);
+ conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0);
if (!conncount_rb_cachep) {
kmem_cache_destroy(conncount_conn_cachep);
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index d011d2eb0848..7be4c35e4795 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- nf_ct_refresh(ct, skb, master_timeout * HZ);
+ nf_ct_refresh(ct, master_timeout * HZ);
/* No data? */
dataoff = protoff + sizeof(struct udphdr);
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 24002bc61e07..4a136fc3a9c0 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/btf_ids.h>
#include <linux/net_namespace.h>
+#include <net/xdp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -31,7 +32,9 @@
* -EINVAL - Passed NULL for bpf_tuple pointer
* -EINVAL - opts->reserved is not 0
* -EINVAL - netns_id is less than -1
- * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12
+ * -EINVAL - opts->ct_zone_id set when
+ opts__sz isn't NF_BPF_CT_OPTS_SZ (16)
* -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
* -ENONET - No network namespace found for netns_id
* -ENOENT - Conntrack lookup could not find entry for tuple
@@ -41,6 +44,8 @@
* Values:
* IPPROTO_TCP, IPPROTO_UDP
* @dir: - connection tracking tuple direction.
+ * @ct_zone_id - connection tracking zone id.
+ * @ct_zone_dir - connection tracking zone direction.
* @reserved - Reserved member, will be reused for more options in future
* Values:
* 0
@@ -50,11 +55,13 @@ struct bpf_ct_opts {
s32 error;
u8 l4proto;
u8 dir;
- u8 reserved[2];
+ u16 ct_zone_id;
+ u8 ct_zone_dir;
+ u8 reserved[3];
};
enum {
- NF_BPF_CT_OPTS_SZ = 12,
+ NF_BPF_CT_OPTS_SZ = 16,
};
static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
@@ -103,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
u32 timeout)
{
struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conntrack_zone ct_zone;
struct nf_conn *ct;
int err;
- if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
- opts_len != NF_BPF_CT_OPTS_SZ)
+ if (!opts || !bpf_tuple)
return ERR_PTR(-EINVAL);
+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
+ return ERR_PTR(-EINVAL);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (opts->ct_zone_id)
+ return ERR_PTR(-EINVAL);
+ }
if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
return ERR_PTR(-EINVAL);
@@ -129,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
return ERR_PTR(-ENONET);
}
- ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->ct_zone_dir == 0)
+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
+ nf_ct_zone_init(&ct_zone,
+ opts->ct_zone_id, opts->ct_zone_dir, 0);
+ } else {
+ ct_zone = nf_ct_zone_dflt;
+ }
+
+ ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple,
GFP_ATOMIC);
if (IS_ERR(ct))
goto out;
@@ -151,12 +176,21 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
{
struct nf_conntrack_tuple_hash *hash;
struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone ct_zone;
struct nf_conn *ct;
int err;
- if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
- opts_len != NF_BPF_CT_OPTS_SZ)
+ if (!opts || !bpf_tuple)
return ERR_PTR(-EINVAL);
+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
+ return ERR_PTR(-EINVAL);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (opts->ct_zone_id)
+ return ERR_PTR(-EINVAL);
+ }
if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
return ERR_PTR(-EPROTO);
if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
@@ -173,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
return ERR_PTR(-ENONET);
}
- hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->ct_zone_dir == 0)
+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
+ nf_ct_zone_init(&ct_zone,
+ opts->ct_zone_id, opts->ct_zone_dir, 0);
+ } else {
+ ct_zone = nf_ct_zone_dflt;
+ }
+
+ hash = nf_conntrack_find_get(net, &ct_zone, &tuple);
if (opts->netns_id >= 0)
put_net(net);
if (!hash)
@@ -192,8 +235,7 @@ BTF_ID(struct, nf_conn___init)
/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ int off, int size)
{
const struct btf_type *ncit, *nct, *t;
size_t end;
@@ -230,9 +272,7 @@ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
return 0;
}
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "Global functions as their definitions will be in nf_conntrack BTF");
+__bpf_kfunc_start_defs();
/* bpf_xdp_ct_alloc - Allocate a new CT entry
*
@@ -247,9 +287,9 @@ __diag_ignore_all("-Wmissing-prototypes",
* @opts - Additional options for allocation (documented above)
* Cannot be NULL
* @opts__sz - Length of the bpf_ct_opts structure
- * Must be NF_BPF_CT_OPTS_SZ (12)
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
*/
-struct nf_conn___init *
+__bpf_kfunc struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -281,9 +321,9 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
* @opts - Additional options for lookup (documented above)
* Cannot be NULL
* @opts__sz - Length of the bpf_ct_opts structure
- * Must be NF_BPF_CT_OPTS_SZ (12)
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
*/
-struct nf_conn *
+__bpf_kfunc struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -314,9 +354,9 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
* @opts - Additional options for allocation (documented above)
* Cannot be NULL
* @opts__sz - Length of the bpf_ct_opts structure
- * Must be NF_BPF_CT_OPTS_SZ (12)
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
*/
-struct nf_conn___init *
+__bpf_kfunc struct nf_conn___init *
bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -349,9 +389,9 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
* @opts - Additional options for lookup (documented above)
* Cannot be NULL
* @opts__sz - Length of the bpf_ct_opts structure
- * Must be NF_BPF_CT_OPTS_SZ (12)
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
*/
-struct nf_conn *
+__bpf_kfunc struct nf_conn *
bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -376,11 +416,13 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
* @nfct - Pointer to referenced nf_conn___init object, obtained
* using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
*/
-struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
{
struct nf_conn *nfct = (struct nf_conn *)nfct_i;
int err;
+ if (!nf_ct_is_confirmed(nfct))
+ nfct->timeout += nfct_time_stamp;
nfct->status |= IPS_CONFIRMED;
err = nf_conntrack_hash_check_insert(nfct);
if (err < 0) {
@@ -400,10 +442,8 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
* @nf_conn - Pointer to referenced nf_conn object, obtained using
* bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
*/
-void bpf_ct_release(struct nf_conn *nfct)
+__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
- if (!nfct)
- return;
nf_ct_put(nfct);
}
@@ -417,7 +457,7 @@ void bpf_ct_release(struct nf_conn *nfct)
* bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
* @timeout - Timeout in msecs.
*/
-void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
{
__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
}
@@ -432,7 +472,7 @@ void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
* bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
* @timeout - New timeout in msecs.
*/
-int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
{
return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
}
@@ -447,7 +487,7 @@ int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
* bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
* @status - New status value.
*/
-int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
{
return nf_ct_change_status_common((struct nf_conn *)nfct, status);
}
@@ -462,14 +502,14 @@ int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
* bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
* @status - New status value.
*/
-int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
{
return nf_ct_change_status_common(nfct, status);
}
-__diag_pop()
+__bpf_kfunc_end_defs();
-BTF_SET8_START(nf_ct_kfunc_set)
+BTF_KFUNCS_START(nf_ct_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
@@ -480,7 +520,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
-BTF_SET8_END(nf_ct_kfunc_set)
+BTF_KFUNCS_END(nf_ct_kfunc_set)
static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 9fb9b8031298..a7552a46d6ac 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -75,10 +75,11 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
nf_ct_expect_related(exp, 0);
nf_ct_expect_put(exp);
- nf_ct_refresh(ct, skb, timeout * HZ);
+ nf_ct_refresh(ct, timeout * HZ);
out:
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Broadcast connection tracking helper");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 496c4920505b..0b95f226f211 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -96,8 +96,8 @@ static DEFINE_MUTEX(nf_conntrack_mutex);
#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX (64000u / HZ)
-#define MIN_CHAINLEN 8u
-#define MAX_CHAINLEN (32u - MIN_CHAINLEN)
+#define MIN_CHAINLEN 50u
+#define MAX_CHAINLEN (80u - MIN_CHAINLEN)
static struct conntrack_gc_work conntrack_gc_work;
@@ -136,8 +136,8 @@ static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
}
/* return true if we need to recompute hashes (in case hash table was resized) */
-static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
- unsigned int h2, unsigned int sequence)
+static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2,
+ unsigned int sequence)
{
h1 %= CONNTRACK_LOCKS;
h2 %= CONNTRACK_LOCKS;
@@ -211,24 +211,18 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
unsigned int zoneid,
const struct net *net)
{
- u64 a, b, c, d;
+ siphash_key_t key;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- /* The direction must be ignored, handle usable tuplehash members manually */
- a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3];
- b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3];
+ key = nf_conntrack_hash_rnd;
- c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16;
- c |= tuple->dst.protonum;
+ key.key[0] ^= zoneid;
+ key.key[1] ^= net_hash_mix(net);
- d = (u64)zoneid << 32 | net_hash_mix(net);
-
- /* IPv4: u3.all[1,2,3] == 0 */
- c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2];
- d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2];
-
- return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd);
+ return siphash((void *)tuple,
+ offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
+ &key);
}
static u32 scale_hash(u32 hash)
@@ -335,9 +329,6 @@ nf_ct_get_tuple(const struct sk_buff *skb,
#ifdef CONFIG_NF_CT_PROTO_SCTP
case IPPROTO_SCTP:
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- case IPPROTO_DCCP:
-#endif
/* fallthrough */
return nf_ct_get_tuple_ports(skb, dataoff, tuple);
default:
@@ -511,10 +502,14 @@ u32 nf_ct_get_id(const struct nf_conn *ct)
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);
+static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
+{
+ return nf_ct_get_id(nf_ct_to_nf_conn(nfct));
+}
+
static void
clean_from_lists(struct nf_conn *ct)
{
- pr_debug("clean_from_lists(%p)\n", ct);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
@@ -538,10 +533,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
p = tmpl;
tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
- if (tmpl != p) {
- tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+ if (tmpl != p)
tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
- }
} else {
tmpl = kzalloc(sizeof(*tmpl), flags);
if (!tmpl)
@@ -582,7 +575,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- pr_debug("%s(%p)\n", __func__, ct);
WARN_ON(refcount_read(&nfct->use) != 0);
if (unlikely(nf_ct_is_template(ct))) {
@@ -603,7 +595,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct)
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("%s: returning ct=%p to slab\n", __func__, ct);
nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);
@@ -622,7 +613,7 @@ static void __nf_ct_delete_from_lists(struct nf_conn *ct)
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
@@ -786,8 +777,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- rcu_read_lock();
-
h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
/* We have a candidate that matches the tuple we're interested
@@ -799,7 +788,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
smp_acquire__after_ctrl_dep();
if (likely(nf_ct_key_equal(h, tuple, zone, net)))
- goto found;
+ return h;
/* TYPESAFE_BY_RCU recycled the candidate */
nf_ct_put(ct);
@@ -807,8 +796,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
h = NULL;
}
-found:
- rcu_read_unlock();
return h;
}
@@ -820,16 +807,21 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
struct nf_conntrack_tuple_hash *thash;
+ rcu_read_lock();
+
thash = __nf_conntrack_find_get(net, zone, tuple,
hash_conntrack_raw(tuple, zone_id, net));
if (thash)
- return thash;
+ goto out_unlock;
rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
if (rid != zone_id)
- return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple, rid, net));
+ thash = __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, rid, net));
+
+out_unlock:
+ rcu_read_unlock();
return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -886,10 +878,8 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
zone = nf_ct_zone(ct);
- if (!nf_ct_ext_valid_pre(ct->ext)) {
- NF_CT_STAT_INC_ATOMIC(net, insert_failed);
- return -ETIMEDOUT;
- }
+ if (!nf_ct_ext_valid_pre(ct->ext))
+ return -EAGAIN;
local_bh_disable();
do {
@@ -900,7 +890,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
@@ -924,6 +914,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
goto chaintoolong;
}
+ /* If genid has changed, we can't insert anymore because ct
+ * extensions could have stale pointers and nf_ct_iterate_destroy
+ * might have completed its table scan already.
+ *
+ * Increment of the ext genid right after this check is fine:
+ * nf_ct_iterate_destroy blocks until locks are released.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
smp_wmb();
/* The caller holds a reference to this object */
refcount_set(&ct->ct_general.use, 2);
@@ -932,12 +934,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
NF_CT_STAT_INC(net, insert);
local_bh_enable();
- if (!nf_ct_ext_valid_post(ct->ext)) {
- nf_ct_kill(ct);
- NF_CT_STAT_INC_ATOMIC(net, drop);
- return -ETIMEDOUT;
- }
-
return 0;
chaintoolong:
NF_CT_STAT_INC(net, chaintoolong);
@@ -992,6 +988,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}
+/**
+ * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
+ * @ct1: conntrack in hash table to check against
+ * @ct2: merge candidate
+ *
+ * returns true if ct1 and ct2 happen to refer to the same flow, but
+ * in opposing directions, i.e.
+ * ct1: a:b -> c:d
+ * ct2: c:d -> a:b
+ * for both directions. If so, @ct2 should not have been created
+ * as the skb should have been picked up as ESTABLISHED flow.
+ * But ct1 was not yet committed to hash table before skb that created
+ * ct2 had arrived.
+ *
+ * Note we don't compare netns because ct entries in different net
+ * namespace cannot clash to begin with.
+ *
+ * @return: true if ct1 and ct2 are identical when swapping origin/reply.
+ */
+static bool
+nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+ u16 id1, id2;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
+ return false;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
+ if (id1 != id2)
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);
+
+ return id1 == id2;
+}
+
+static int nf_ct_can_merge(const struct nf_conn *ct,
+ const struct nf_conn *loser_ct)
+{
+ return nf_ct_match(ct, loser_ct) ||
+ nf_ct_match_reverse(ct, loser_ct);
+}
+
/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
@@ -1003,11 +1049,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
loser_ct = nf_ct_get(skb, &ctinfo);
- if (nf_ct_is_dying(ct))
- return NF_DROP;
-
- if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
- nf_ct_match(ct, loser_ct)) {
+ if (nf_ct_can_merge(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);
nf_conntrack_get(&ct->ct_general);
@@ -1079,6 +1121,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
&nf_conntrack_hash[repl_idx]);
+ /* confirmed bit must be set after hlist add, not before:
+ * loser_ct can still be visible to other cpu due to
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ smp_mb__before_atomic();
+ set_bit(IPS_CONFIRMED_BIT, &loser_ct->status);
NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
@@ -1094,7 +1142,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
* A conntrack entry can be inserted to the connection tracking table
* if there is no existing entry with an identical tuple.
*
- * If there is one, @skb (and the assocated, unconfirmed conntrack) has
+ * If there is one, @skb (and the associated, unconfirmed conntrack) has
* to be dropped. In case @skb is retransmitted, next conntrack lookup
* will find the already-existing entry.
*
@@ -1186,7 +1234,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
* connections for unconfirmed conns. But packet copies and
@@ -1210,14 +1258,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
goto dying;
}
- pr_debug("Confirming conntrack %p\n", ct);
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
- ct->status |= IPS_CONFIRMED;
-
if (unlikely(nf_ct_is_dying(ct))) {
NF_CT_STAT_INC(net, insert_failed);
goto dying;
@@ -1249,7 +1294,7 @@ chaintoolong:
}
}
- /* Timer relative to confirmation time, not original
+ /* Timeout is relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout += nfct_time_stamp;
@@ -1257,11 +1302,21 @@ chaintoolong:
__nf_conntrack_insert_prepare(ct);
/* Since the lookup is lockless, hash insertion must be done after
- * starting the timer and setting the CONFIRMED bit. The RCU barriers
- * guarantee that no other CPU can find the conntrack before the above
- * stores are visible.
+ * setting ct->timeout. The RCU barriers guarantee that no other CPU
+ * can find the conntrack before the above stores are visible.
*/
__nf_conntrack_hash_insert(ct, hash, reply_hash);
+
+ /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups
+ * skip entries that lack this bit. This happens when a CPU is looking
+ * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU
+ * or when another CPU encounters this entry right after the insertion
+ * but before the set-confirm-bit below. This bit must not be set until
+ * after __nf_conntrack_hash_insert().
+ */
+ smp_mb__before_atomic();
+ set_bit(IPS_CONFIRMED_BIT, &ct->status);
+
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
@@ -1292,7 +1347,7 @@ dying:
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
-/* Returns true if a connection correspondings to the tuple (required
+/* Returns true if a connection corresponds to the tuple (required
for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
@@ -1374,9 +1429,6 @@ static unsigned int early_drop_list(struct net *net,
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
- continue;
-
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
continue;
@@ -1446,11 +1498,12 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct)
static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
+ u8 protonum = nf_ct_protonum(ct);
if (!test_bit(IPS_ASSURED_BIT, &ct->status))
return true;
- l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
+ l4proto = nf_ct_l4proto_find(protonum);
if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
return true;
@@ -1505,11 +1558,6 @@ static void gc_worker(struct work_struct *work)
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- nf_ct_offload_timeout(tmp);
- continue;
- }
-
if (expired_count > GC_SCAN_EXPIRED_MAX) {
rcu_read_unlock();
@@ -1620,12 +1668,16 @@ __nf_conntrack_alloc(struct net *net,
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (unlikely(ct_count > nf_conntrack_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
atomic_dec(&cnet->count);
- net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ if (net == &init_net)
+ net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ else
+ net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n",
+ net->ns.inum);
return ERR_PTR(-ENOMEM);
}
}
@@ -1721,16 +1773,14 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_zone tmp;
struct nf_conntrack_net *cnet;
- if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
- pr_debug("Can't invert tuple.\n");
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple))
return NULL;
- }
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
- return (struct nf_conntrack_tuple_hash *)ct;
+ return ERR_CAST(ct);
if (!nf_ct_add_synproxy(ct, tmpl)) {
nf_conntrack_free(ct);
@@ -1762,10 +1812,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
cnet = nf_ct_pernet(net);
if (cnet->expect_count) {
spin_lock_bh(&nf_conntrack_expect_lock);
- exp = nf_ct_find_expectation(net, zone, tuple);
+ exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
if (exp) {
- pr_debug("expectation arrives ct=%p exp=%p\n",
- ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
@@ -1829,10 +1877,8 @@ resolve_normal_ct(struct nf_conn *tmpl,
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, state->pf, protonum, state->net,
- &tuple)) {
- pr_debug("Can't get tuple\n");
+ &tuple))
return 0;
- }
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
@@ -1864,17 +1910,15 @@ resolve_normal_ct(struct nf_conn *tmpl,
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
ctinfo = IP_CT_ESTABLISHED_REPLY;
} else {
+ unsigned long status = READ_ONCE(ct->status);
+
/* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("normal packet for %p\n", ct);
+ if (likely(status & IPS_SEEN_REPLY))
ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("related packet for %p\n", ct);
+ else if (status & IPS_EXPECTED)
ctinfo = IP_CT_RELATED;
- } else {
- pr_debug("new packet for %p\n", ct);
+ else
ctinfo = IP_CT_NEW;
- }
}
nf_ct_set(skb, ct, ctinfo);
return 0;
@@ -1953,11 +1997,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct,
return nf_conntrack_sctp_packet(ct, skb, dataoff,
ctinfo, state);
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- case IPPROTO_DCCP:
- return nf_conntrack_dccp_packet(ct, skb, dataoff,
- ctinfo, state);
-#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
case IPPROTO_GRE:
return nf_conntrack_gre_packet(ct, skb, dataoff,
@@ -1988,7 +2027,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
/* rcu_read_lock()ed by nf_hook_thresh */
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {
- pr_debug("not prepared to track yet or error occurred\n");
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
@@ -2027,7 +2065,6 @@ repeat:
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
- pr_debug("nf_conntrack_in: Can't track with proto module\n");
nf_ct_put(ct);
skb->_nfct = 0;
/* Special case: TCP tracker reports an attempt to reopen a
@@ -2038,7 +2075,7 @@ repeat:
goto repeat;
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- if (ret == -NF_DROP)
+ if (ret == NF_DROP)
NF_CT_STAT_INC_ATOMIC(state->net, drop);
ret = -ret;
@@ -2056,31 +2093,11 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);
-/* Alter reply tuple (maybe alter helper). This is for NAT, and is
- implicitly racy: see __nf_conntrack_confirm */
-void nf_conntrack_alter_reply(struct nf_conn *ct,
- const struct nf_conntrack_tuple *newreply)
-{
- struct nf_conn_help *help = nfct_help(ct);
-
- /* Should be unconfirmed, so not in hash table yet */
- WARN_ON(nf_ct_is_confirmed(ct));
-
- pr_debug("Altering reply tuple of %p to ", ct);
- nf_ct_dump_tuple(newreply);
-
- ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (ct->master || (help && !hlist_empty(&help->expectations)))
- return;
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
-
/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
u32 extra_jiffies,
- bool do_acct)
+ unsigned int bytes)
{
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
@@ -2093,8 +2110,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
if (READ_ONCE(ct->timeout) != extra_jiffies)
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
- if (do_acct)
- nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
+ if (bytes)
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -2186,74 +2203,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
-static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- const struct nf_nat_hook *nat_hook;
- struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
- unsigned int status;
- int dataoff;
- u16 l3num;
- u8 l4num;
-
- l3num = nf_ct_l3num(ct);
-
- dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
- if (dataoff <= 0)
- return -1;
-
- if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- l4num, net, &tuple))
- return -1;
-
- if (ct->status & IPS_SRC_NAT) {
- memcpy(tuple.src.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
- sizeof(tuple.src.u3.all));
- tuple.src.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
- }
-
- if (ct->status & IPS_DST_NAT) {
- memcpy(tuple.dst.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
- sizeof(tuple.dst.u3.all));
- tuple.dst.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
- }
-
- h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
- if (!h)
- return 0;
-
- /* Store status bits of the conntrack that is clashing to re-do NAT
- * mangling according to what it has been done already to this packet.
- */
- status = ct->status;
-
- nf_ct_put(ct);
- ct = nf_ct_tuplehash_to_ctrack(h);
- nf_ct_set(skb, ct, ctinfo);
-
- nat_hook = rcu_dereference(nf_nat_hook);
- if (!nat_hook)
- return 0;
-
- if (status & IPS_SRC_NAT &&
- nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
- IP_CT_DIR_ORIGINAL) == NF_DROP)
- return -1;
-
- if (status & IPS_DST_NAT &&
- nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
- IP_CT_DIR_ORIGINAL) == NF_DROP)
- return -1;
-
- return 0;
-}
-
/* This packet is coming from userspace via nf_queue, complete the packet
* processing after the helper invocation in nf_confirm().
*/
@@ -2266,11 +2215,14 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
help = nfct_help(ct);
if (!help)
- return 0;
+ return NF_ACCEPT;
helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
- return 0;
+ return NF_ACCEPT;
switch (nf_ct_l3num(ct)) {
case NFPROTO_IPV4:
@@ -2285,43 +2237,34 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
&frag_off);
if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
- return 0;
+ return NF_ACCEPT;
break;
}
#endif
default:
- return 0;
+ return NF_ACCEPT;
}
if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
!nf_is_loopback_packet(skb)) {
if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return -1;
+ return NF_DROP;
}
}
/* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0;
+ return nf_conntrack_confirm(skb);
}
static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
- int err;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
- return 0;
-
- if (!nf_ct_is_confirmed(ct)) {
- err = __nf_conntrack_update(net, skb, ct, ctinfo);
- if (err < 0)
- return err;
-
- ct = nf_ct_get(skb, &ctinfo);
- }
+ return NF_ACCEPT;
return nf_confirm_cthelper(skb, ct, ctinfo);
}
@@ -2552,7 +2495,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
* netfilter framework. Roll on, two-stage module
* delete...
*/
- synchronize_net();
+ synchronize_rcu_expedited();
i_see_dead_people:
busy = 0;
list_for_each_entry(net, net_exit_list, exit_list) {
@@ -2580,12 +2523,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
- if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
+ if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head)))
return NULL;
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
+ if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head)))
+ return NULL;
+
hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
if (hash && nulls)
@@ -2761,11 +2707,25 @@ err_cachep:
return ret;
}
+static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
+{
+ struct nf_conn *ct = nf_ct_to_nf_conn(nfct);
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ nf_conntrack_tcp_set_closing(ct);
+ break;
+ }
+}
+
static const struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
.destroy = nf_ct_destroy,
.get_tuple_skb = nf_conntrack_get_tuple_skb,
.attach = nf_conntrack_attach,
+ .set_closing = nf_conntrack_set_closing,
+ .confirm = __nf_conntrack_confirm,
+ .get_id = nf_conntrack_get_id,
};
void nf_conntrack_init_end(void)
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 8698b3424646..81baf2082604 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -162,6 +162,14 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
return ret;
}
+static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (local64_read(&e->timestamp))
+ local64_set(&e->timestamp, ktime_get_real_ns());
+#endif
+}
+
int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
u32 portid, int report)
{
@@ -186,6 +194,8 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
/* This is a resent of a destroy event? If so, skip missed */
missed = e->portid ? 0 : e->missed;
+ nf_ct_ecache_tstamp_refresh(e);
+
ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
/* This is a destroy event that has been triggered by a process,
@@ -291,12 +301,24 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
net->ct.ecache_dwork_pending = true;
} else if (state == NFCT_ECACHE_DESTROY_SENT) {
if (!hlist_nulls_empty(&cnet->ecache.dying_list))
- mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+ mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0);
else
net->ct.ecache_dwork_pending = false;
}
}
+static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ u64 ts = 0;
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ ts = ktime_get_real_ns();
+
+ local64_set(&e->timestamp, ts);
+#endif
+}
+
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
struct net *net = nf_ct_net(ct);
@@ -309,7 +331,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp
break;
return true;
case 2: /* autodetect: no event listener, don't allocate extension. */
- if (!READ_ONCE(net->ct.ctnetlink_has_listener))
+ if (!READ_ONCE(nf_ctnetlink_has_listener))
return true;
fallthrough;
case 1:
@@ -326,6 +348,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp
e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
if (e) {
+ nf_ct_ecache_tstamp_new(ct, e);
e->ctmask = ctmask;
e->expmask = expmask;
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 96948e98ec53..cfc2daa3fc7f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
static void nf_ct_expectation_timed_out(struct timer_list *t)
{
- struct nf_conntrack_expect *exp = from_timer(exp, t, timeout);
+ struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout);
spin_lock_bh(&nf_conntrack_expect_lock);
nf_ct_unlink_expect(exp);
@@ -118,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
bool nf_ct_remove_expect(struct nf_conntrack_expect *exp)
{
- if (del_timer(&exp->timeout)) {
+ if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
return true;
@@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net,
const struct nf_conntrack_zone *zone,
- const struct nf_conntrack_tuple *tuple)
+ const struct nf_conntrack_tuple *tuple, bool unlink)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i, *exp = NULL;
@@ -211,14 +211,14 @@ nf_ct_find_expectation(struct net *net,
!refcount_inc_not_zero(&exp->master->ct_general.use)))
return NULL;
- if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+ if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) {
refcount_inc(&exp->use);
return exp;
- } else if (del_timer(&exp->timeout)) {
+ } else if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
return exp;
}
- /* Undo exp->master refcnt increase, if del_timer() failed */
+ /* Undo exp->master refcnt increase, if timer_delete() failed */
nf_ct_put(exp->master);
return NULL;
@@ -520,7 +520,7 @@ void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo
hlist_for_each_entry_safe(exp, next,
&nf_ct_expect_hash[i],
hnode) {
- if (iter(exp, data) && del_timer(&exp->timeout)) {
+ if (iter(exp, data) && timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
}
@@ -550,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net,
if (!net_eq(nf_ct_exp_net(exp), net))
continue;
- if (iter(exp, data) && del_timer(&exp->timeout)) {
+ if (iter(exp, data) && timer_delete(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, portid, report);
nf_ct_expect_put(exp);
}
@@ -722,9 +722,7 @@ int nf_conntrack_expect_init(void)
nf_ct_expect_hsize = 1;
}
nf_ct_expect_max = nf_ct_expect_hsize * 4;
- nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
- sizeof(struct nf_conntrack_expect),
- 0, 0, NULL);
+ nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0);
if (!nf_ct_expect_cachep)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 0b513f7bf9f3..dd62cc12e775 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -40,10 +40,10 @@ static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
[NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_acct),
+ [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_tstamp),
+ [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
[NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index e697a824b001..540d97715bd2 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -533,6 +533,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
/* Get fields bitmap */
if (nf_h323_error_boundary(bs, 0, f->sz))
return H323_ERROR_BOUND;
+ if (f->sz > 32)
+ return H323_ERROR_RANGE;
bmp = get_bitmap(bs, f->sz);
if (base)
*(unsigned int *)base = bmp;
@@ -589,6 +591,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
bmp2_len = get_bits(bs, 7) + 1;
if (nf_h323_error_boundary(bs, 0, bmp2_len))
return H323_ERROR_BOUND;
+ if (bmp2_len > 32)
+ return H323_ERROR_RANGE;
bmp2 = get_bitmap(bs, bmp2_len);
bmp |= bmp2 >> f->sz;
if (base)
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5a9bce24f3c3..14f73872f647 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
if (info->timeout > 0) {
pr_debug("nf_ct_ras: set RAS connection timeout to "
"%u seconds\n", info->timeout);
- nf_ct_refresh(ct, skb, info->timeout * HZ);
+ nf_ct_refresh(ct, info->timeout * HZ);
/* Set expect timeout */
spin_lock_bh(&nf_conntrack_expect_lock);
@@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
info->sig_port[!dir] = 0;
/* Give it 30 seconds for UCF or URJ */
- nf_ct_refresh(ct, skb, 30 * HZ);
+ nf_ct_refresh(ct, 30 * HZ);
return 0;
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 48ea6d0264b5..ceb48c3ca0a4 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -194,12 +194,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
- /* We already got a helper explicitly attached. The function
- * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
- * the helper up again. Since now the user is in full control of
- * making consistent helper configurations, skip this automatic
- * re-lookup, otherwise we'll lose the helper.
- */
+ /* We already got a helper explicitly attached (e.g. nft_ct) */
if (test_bit(IPS_HELPER_BIT, &ct->status))
return 0;
@@ -242,104 +237,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
-/* 'skb' should already be pulled to nh_ofs. */
-int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo, u16 proto)
-{
- const struct nf_conntrack_helper *helper;
- const struct nf_conn_help *help;
- unsigned int protoff;
- int err;
-
- if (ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
- helper->tuple.src.l3num != proto)
- return NF_ACCEPT;
-
- switch (proto) {
- case NFPROTO_IPV4:
- protoff = ip_hdrlen(skb);
- proto = ip_hdr(skb)->protocol;
- break;
- case NFPROTO_IPV6: {
- u8 nexthdr = ipv6_hdr(skb)->nexthdr;
- __be16 frag_off;
- int ofs;
-
- ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
- protoff = ofs;
- proto = nexthdr;
- break;
- }
- default:
- WARN_ONCE(1, "helper invoked on non-IP family!");
- return NF_DROP;
- }
-
- if (helper->tuple.dst.protonum != proto)
- return NF_ACCEPT;
-
- err = helper->help(skb, protoff, ct, ctinfo);
- if (err != NF_ACCEPT)
- return err;
-
- /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
- * FTP with NAT) adusting the TCP payload size when mangling IP
- * addresses and/or port numbers in the text-based control connection.
- */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
- return NF_DROP;
- return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(nf_ct_helper);
-
-int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
- u8 proto, bool nat, struct nf_conntrack_helper **hp)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conn_help *help;
- int ret = 0;
-
- helper = nf_conntrack_helper_try_module_get(name, family, proto);
- if (!helper)
- return -EINVAL;
-
- help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
- if (!help) {
- nf_conntrack_helper_put(helper);
- return -ENOMEM;
- }
-#if IS_ENABLED(CONFIG_NF_NAT)
- if (nat) {
- ret = nf_nat_helper_try_module_get(name, family, proto);
- if (ret) {
- nf_conntrack_helper_put(helper);
- return ret;
- }
- }
-#endif
- rcu_assign_pointer(help->helper, helper);
- *hp = helper;
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_add_helper);
-
/* appropriate ct lock protecting must be taken by caller */
static int unhelp(struct nf_conn *ct, void *me)
{
@@ -458,6 +355,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
+ if (!nf_ct_helper_hash)
+ return -ENOENT;
+
if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
return -EINVAL;
@@ -468,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
(cur->tuple.src.l3num == NFPROTO_UNSPEC ||
cur->tuple.src.l3num == me->tuple.src.l3num) &&
cur->tuple.dst.protonum == me->tuple.dst.protonum) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -479,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple,
&mask)) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -613,4 +513,5 @@ int nf_conntrack_helper_init(void)
void nf_conntrack_helper_fini(void)
{
kvfree(nf_ct_helper_hash);
+ nf_ct_helper_hash = NULL;
}
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 6e70e137a0a6..6c46aad23313 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -11,8 +11,6 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
-static DEFINE_SPINLOCK(nf_connlabels_lock);
-
static int replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
@@ -60,23 +58,24 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace);
int nf_connlabels_get(struct net *net, unsigned int bits)
{
+ int v;
+
if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
return -ERANGE;
- spin_lock(&nf_connlabels_lock);
- net->ct.labels_used++;
- spin_unlock(&nf_connlabels_lock);
-
BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+ v = atomic_inc_return_relaxed(&net->ct.labels_used);
+ WARN_ON_ONCE(v <= 0);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_get);
void nf_connlabels_put(struct net *net)
{
- spin_lock(&nf_connlabels_lock);
- net->ct.labels_used--;
- spin_unlock(&nf_connlabels_lock);
+ int v = atomic_dec_return_relaxed(&net->ct.labels_used);
+
+ WARN_ON_ONCE(v < 0);
}
EXPORT_SYMBOL_GPL(nf_connlabels_put);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1286ae7d4609..3a04665adf99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -57,9 +57,10 @@
#include "nf_internals.h"
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("List and change connection tracking table");
struct ctnetlink_list_dump_ctx {
- struct nf_conn *last;
+ unsigned long last_id;
unsigned int cpu;
bool done;
};
@@ -176,7 +177,12 @@ nla_put_failure:
static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
bool skip_zero)
{
- long timeout = nf_ct_expires(ct) / HZ;
+ long timeout;
+
+ if (nf_ct_is_confirmed(ct))
+ timeout = nf_ct_expires(ct) / HZ;
+ else
+ timeout = ct->timeout / HZ;
if (skip_zero && timeout == 0)
return 0;
@@ -328,11 +334,12 @@ nla_put_failure:
}
#ifdef CONFIG_NF_CONNTRACK_MARK
-static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct,
+ bool dump)
{
u32 mark = READ_ONCE(ct->mark);
- if (!mark)
+ if (!mark && !dump)
return 0;
if (nla_put_be32(skb, CTA_MARK, htonl(mark)))
@@ -343,18 +350,18 @@ nla_put_failure:
return -1;
}
#else
-#define ctnetlink_dump_mark(a, b) (0)
+#define ctnetlink_dump_mark(a, b, c) (0)
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
- int len, ret;
- char *secctx;
+ struct lsm_context ctx;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return 0;
ret = -1;
@@ -362,20 +369,37 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
if (!nest_secctx)
goto nla_put_failure;
- if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context))
goto nla_put_failure;
nla_nest_end(skb, nest_secctx);
ret = 0;
nla_put_failure:
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
return ret;
}
#else
#define ctnetlink_dump_secctx(a, b) (0)
#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int
+ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct);
+
+ if (e) {
+ u64 ts = local64_read(&e->timestamp);
+
+ if (ts)
+ return nla_put_be64(skb, CTA_TIMESTAMP_EVENT,
+ cpu_to_be64(ts), CTA_TIMESTAMP_PAD);
+ }
+#endif
+ return 0;
+}
+
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -384,6 +408,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
return 0;
return nla_total_size(sizeof(labels->bits));
}
+#endif
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
@@ -404,10 +429,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
return 0;
}
-#else
-#define ctnetlink_dump_labels(a, b) (0)
-#define ctnetlink_label_size(a) (0)
-#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -548,7 +569,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb,
static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
{
if (ctnetlink_dump_status(skb, ct) < 0 ||
- ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct, true) < 0 ||
ctnetlink_dump_secctx(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
@@ -645,7 +666,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
return len + len4;
}
-#endif
static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
@@ -660,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- int len, ret;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, NULL, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, NULL);
+ if (ret < 0)
return 0;
return nla_total_size(0) /* CTA_SECCTX */
- + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+ + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */
#else
return 0;
#endif
@@ -683,6 +703,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
return 0;
#endif
}
+#endif
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
@@ -713,6 +734,9 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */
+#endif
;
}
@@ -831,10 +855,13 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if (events & (1 << IPCT_MARK) &&
- ctnetlink_dump_mark(skb, ct) < 0)
+ if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK)))
goto nla_put_failure;
#endif
+
+ if (ctnetlink_dump_event_timestamp(skb, ct))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
err = nfnetlink_send(skb, net, item->portid, group, item->report,
GFP_ATOMIC);
@@ -857,8 +884,6 @@ errout:
static int ctnetlink_done(struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
kfree(cb->data);
return 0;
}
@@ -870,6 +895,7 @@ struct ctnetlink_filter_u32 {
struct ctnetlink_filter {
u8 family;
+ bool zone_filter;
u_int32_t orig_flags;
u_int32_t reply_flags;
@@ -986,13 +1012,16 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
if (err)
goto err_filter;
+ if (cda[CTA_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
+ if (err < 0)
+ goto err_filter;
+ filter->zone_filter = true;
+ }
+
if (!cda[CTA_FILTER])
return filter;
- err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
- if (err < 0)
- goto err_filter;
-
err = ctnetlink_parse_filter(cda[CTA_FILTER], filter);
if (err < 0)
goto err_filter;
@@ -1037,7 +1066,7 @@ err_filter:
static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
{
- return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS];
+ return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE];
}
static int ctnetlink_start(struct netlink_callback *cb)
@@ -1142,6 +1171,10 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
if (filter->family && nf_ct_l3num(ct) != filter->family)
goto ignore_entry;
+ if (filter->zone_filter &&
+ !nf_ct_zone_equal_any(ct, &filter->zone))
+ goto ignore_entry;
+
if (filter->orig_flags) {
tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
if (!ctnetlink_filter_match_tuple(&filter->orig, tuple,
@@ -1173,19 +1206,26 @@ ignore_entry:
return 0;
}
+static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
+{
+ unsigned long id = nf_ct_get_id(ct);
+
+ return id ? id : 1;
+}
+
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
struct net *net = sock_net(skb->sk);
- struct nf_conn *ct, *last;
+ unsigned long last_id = cb->args[1];
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nf_conn *nf_ct_evict[8];
+ struct nf_conn *ct;
int res, i;
spinlock_t *lockp;
- last = (struct nf_conn *)cb->args[1];
i = 0;
local_bh_disable();
@@ -1222,7 +1262,7 @@ restart:
continue;
if (cb->args[1]) {
- if (ct != last)
+ if (ctnetlink_get_id(ct) != last_id)
continue;
cb->args[1] = 0;
}
@@ -1235,8 +1275,7 @@ restart:
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct, true, flags);
if (res < 0) {
- nf_conntrack_get(&ct->ct_general);
- cb->args[1] = (unsigned long)ct;
+ cb->args[1] = ctnetlink_get_id(ct);
spin_unlock(lockp);
goto out;
}
@@ -1249,12 +1288,10 @@ restart:
}
out:
local_bh_enable();
- if (last) {
+ if (last_id) {
/* nf ct hash resize happened, now clear the leftover. */
- if ((struct nf_conn *)cb->args[1] == last)
+ if (cb->args[1] == last_id)
cb->args[1] = 0;
-
- nf_ct_put(last);
}
while (i) {
@@ -1316,15 +1353,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
struct nlattr *tb[CTA_IP_MAX+1];
int ret = 0;
- ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL);
+ ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr,
+ cta_ip_nla_policy, NULL);
if (ret < 0)
return ret;
- ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX,
- cta_ip_nla_policy, NULL);
- if (ret)
- return ret;
-
switch (tuple->src.l3num) {
case NFPROTO_IPV4:
ret = ipv4_nlattr_to_tuple(tb, tuple, flags);
@@ -1550,13 +1583,11 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
.len = NF_CT_LABELS_MAX_SIZE },
[CTA_FILTER] = { .type = NLA_NESTED },
[CTA_STATUS_MASK] = { .type = NLA_U32 },
+ [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT },
};
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
{
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
- return 0;
-
return ctnetlink_filter_match(ct, data);
}
@@ -1572,9 +1603,6 @@ static int ctnetlink_flush_conntrack(struct net *net,
};
if (ctnetlink_needs_filter(family, cda)) {
- if (cda[CTA_FILTER])
- return -EOPNOTSUPP;
-
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
@@ -1603,14 +1631,14 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb,
if (err < 0)
return err;
- if (cda[CTA_TUPLE_ORIG])
+ if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
family, &zone);
- else if (cda[CTA_TUPLE_REPLY])
+ else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
family, &zone);
else {
- u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC;
+ u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? family : AF_UNSPEC;
return ctnetlink_flush_conntrack(info->net, cda,
NETLINK_CB(skb).portid,
@@ -1626,11 +1654,6 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb,
ct = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
- nf_ct_put(ct);
- return -EBUSY;
- }
-
if (cda[CTA_ID]) {
__be32 id = nla_get_be32(cda[CTA_ID]);
@@ -1710,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
-static int ctnetlink_done_list(struct netlink_callback *cb)
-{
- struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-
- if (ctx->last)
- nf_ct_put(ctx->last);
-
- return 0;
-}
-
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int ctnetlink_dump_one_entry(struct sk_buff *skb,
struct netlink_callback *cb,
@@ -1734,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
if (l3proto && nf_ct_l3num(ct) != l3proto)
return 0;
- if (ctx->last) {
- if (ct != ctx->last)
+ if (ctx->last_id) {
+ if (ctnetlink_get_id(ct) != ctx->last_id)
return 0;
- ctx->last = NULL;
+ ctx->last_id = 0;
}
/* We can't dump extension info for the unconfirmed
@@ -1752,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct, dying, 0);
- if (res < 0) {
- if (!refcount_inc_not_zero(&ct->ct_general.use))
- return 0;
-
- ctx->last = ct;
- }
+ if (res < 0)
+ ctx->last_id = ctnetlink_get_id(ct);
return res;
}
@@ -1773,10 +1782,10 @@ static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
- struct nf_conn *last = ctx->last;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
const struct net *net = sock_net(skb->sk);
struct nf_conntrack_net_ecache *ecache_net;
+ unsigned long last_id = ctx->last_id;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
#endif
@@ -1784,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
if (ctx->done)
return 0;
- ctx->last = NULL;
+ ctx->last_id = 0;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache_net = nf_conn_pernet_ecache(net);
@@ -1795,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
int res;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (last && last != ct)
+ if (last_id && last_id != ctnetlink_get_id(ct))
continue;
res = ctnetlink_dump_one_entry(skb, cb, ct, true);
if (res < 0) {
spin_unlock_bh(&ecache_net->dying_lock);
- nf_ct_put(last);
return skb->len;
}
- nf_ct_put(last);
- last = NULL;
+ last_id = 0;
}
spin_unlock_bh(&ecache_net->dying_lock);
#endif
ctx->done = true;
- nf_ct_put(last);
return skb->len;
}
@@ -1824,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
- .done = ctnetlink_done_list,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
@@ -1839,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
- .done = ctnetlink_done_list,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
@@ -2015,7 +2019,6 @@ static void ctnetlink_change_mark(struct nf_conn *ct,
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
- [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
[CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
};
@@ -2253,9 +2256,6 @@ ctnetlink_create_conntrack(struct net *net,
if (!cda[CTA_TIMEOUT])
goto err1;
- timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
- __nf_ct_set_timeout(ct, timeout);
-
rcu_read_lock();
if (cda[CTA_HELP]) {
char *helpname = NULL;
@@ -2319,6 +2319,9 @@ ctnetlink_create_conntrack(struct net *net,
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
+ timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+ __nf_ct_set_timeout(ct, timeout);
+
if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
@@ -2375,12 +2378,15 @@ ctnetlink_create_conntrack(struct net *net,
err = nf_conntrack_hash_check_insert(ct);
if (err < 0)
- goto err2;
+ goto err3;
rcu_read_unlock();
return ct;
+err3:
+ if (ct->master)
+ nf_ct_put(ct->master);
err2:
rcu_read_unlock();
err1:
@@ -2735,7 +2741,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_MARK
- if (ctnetlink_dump_mark(skb, ct) < 0)
+ if (ctnetlink_dump_mark(skb, ct, true) < 0)
goto nla_put_failure;
#endif
if (ctnetlink_dump_labels(skb, ct) < 0)
@@ -2976,7 +2982,9 @@ nla_put_failure:
return -1;
}
+#if IS_ENABLED(CONFIG_NF_NAT)
static const union nf_inet_addr any_addr;
+#endif
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
@@ -3143,23 +3151,27 @@ errout:
return 0;
}
#endif
-static int ctnetlink_exp_done(struct netlink_callback *cb)
+
+static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
{
- if (cb->args[1])
- nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
- return 0;
+ unsigned long id = (unsigned long)exp;
+
+ id += nf_ct_get_id(exp->master);
+ id += exp->class;
+
+ return id ? id : 1;
}
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
@@ -3171,7 +3183,7 @@ restart:
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -3180,9 +3192,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -3193,32 +3203,30 @@ restart:
}
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
struct nf_conn *ct = cb->data;
struct nf_conn_help *help = nfct_help(ct);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
if (cb->args[0])
return 0;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
+
restart:
hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -3226,9 +3234,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -3239,9 +3245,6 @@ restart:
cb->args[0] = 1;
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
@@ -3260,7 +3263,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
struct nf_conntrack_zone zone;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
- .done = ctnetlink_exp_done,
};
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
@@ -3310,7 +3312,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
- .done = ctnetlink_exp_done,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
@@ -3413,7 +3414,8 @@ static int ctnetlink_del_expect(struct sk_buff *skb,
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
- if (ntohl(id) != (u32)(unsigned long)exp) {
+
+ if (id != nf_expect_get_id(exp)) {
nf_ct_expect_put(exp);
return -ENOENT;
}
@@ -3421,7 +3423,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb,
/* after list removal, usage count == 1 */
spin_lock_bh(&nf_conntrack_expect_lock);
- if (del_timer(&exp->timeout)) {
+ if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
nlmsg_report(info->nlh));
nf_ct_expect_put(exp);
@@ -3450,7 +3452,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x,
const struct nlattr * const cda[])
{
if (cda[CTA_EXPECT_TIMEOUT]) {
- if (!del_timer(&x->timeout))
+ if (!timer_delete(&x->timeout))
return -ETIME;
x->timeout.expires = jiffies +
@@ -3460,10 +3462,12 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x,
return 0;
}
+#if IS_ENABLED(CONFIG_NF_NAT)
static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
[CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
[CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
};
+#endif
static int
ctnetlink_parse_expect_nat(const struct nlattr *attr,
@@ -3866,7 +3870,7 @@ static int __init ctnetlink_init(void)
{
int ret;
- BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx));
+ NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx);
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c
new file mode 100644
index 000000000000..068e9489e1c2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ovs.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Support ct functions for openvswitch and used by OVS and TC conntrack. */
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
+#include <net/ip.h>
+#include <linux/netfilter_ipv6.h>
+
+/* Run the conntrack helper attached to 'ct', if there is one, on 'skb'.
+ *
+ * 'skb' should already be pulled to nh_ofs.
+ *
+ * 'proto' carries the L3 family on entry (NFPROTO_IPV4 or NFPROTO_IPV6).
+ * The same variable is reused further down to hold the L4 protocol number
+ * read from the packet, which is then compared with the helper's tuple.
+ *
+ * Returns NF_ACCEPT when the packet is IP_CT_RELATED_REPLY, when 'ct' has
+ * no helper, when the helper's L3 family or L4 protocol does not match the
+ * packet, when the IPv6 transport header cannot be located, or when the
+ * helper accepted the packet and any needed sequence adjustment succeeded.
+ * Returns the helper's own verdict when that is not NF_ACCEPT, and NF_DROP
+ * for a non-IP family or when nf_ct_seq_adjust() fails.
+ */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ unsigned int protoff;
+ int err;
+
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT; /* no helper extension on this conntrack */
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT; /* extension present but no helper assigned */
+
+ if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+ helper->tuple.src.l3num != proto)
+ return NF_ACCEPT; /* helper is bound to a different L3 family */
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ proto = ip_hdr(skb)->protocol; /* 'proto' now holds the L4 protocol */
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { /* walk failed, or a frag_off bit above the low three is set (presumably a non-first fragment; confirm) */
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ proto = nexthdr; /* 'proto' now holds the L4 protocol */
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ if (helper->tuple.dst.protonum != proto)
+ return NF_ACCEPT; /* helper handles a different L4 protocol */
+
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err; /* pass the helper's verdict through unchanged */
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adjusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
+/* Attach the conntrack helper called 'name' (for 'family' / 'proto') to 'ct'.
+ *
+ * The helper is looked up with nf_conntrack_helper_try_module_get() and a
+ * helper extension is added to 'ct' with a GFP_KERNEL allocation. When
+ * CONFIG_NF_NAT is enabled and 'nat' is true, the matching NAT helper is
+ * requested as well via nf_nat_helper_try_module_get().
+ *
+ * On success the helper is stored in the extension and in '*hp', and 0 is
+ * returned. On failure '*hp' is left unwritten and the helper reference
+ * taken by the lookup is dropped again. Returns -EINVAL when no such
+ * helper is found, -ENOMEM when the extension cannot be added, or the
+ * non-zero value returned by nf_nat_helper_try_module_get().
+ */
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+ u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ int ret = 0;
+
+ helper = nf_conntrack_helper_try_module_get(name, family, proto);
+ if (!helper)
+ return -EINVAL;
+
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+ if (!help) {
+ nf_conntrack_helper_put(helper); /* undo the lookup's reference */
+ return -ENOMEM;
+ }
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (nat) {
+ ret = nf_nat_helper_try_module_get(name, family, proto);
+ if (ret) {
+ nf_conntrack_helper_put(helper); /* undo the lookup's reference */
+ return ret;
+ }
+ }
+#endif
+ rcu_assign_pointer(help->helper, helper);
+ *hp = helper;
+ return ret; /* 0 here: every failure path returned earlier */
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ *
+ * For NFPROTO_IPV4 the target length comes from skb_ip_totlen(). For
+ * NFPROTO_IPV6 it is the header's payload_len plus the fixed IPv6 header
+ * size; when the next header is NEXTHDR_HOP, nf_ip6_check_hbh_len() is
+ * called first with a pointer to the length and any error it reports is
+ * returned as is. For any other 'family' the target is the current
+ * skb->len.
+ *
+ * Returns the result of pskb_trim_rcsum(), or the error from
+ * nf_ip6_check_hbh_len().
+ */
+int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+ unsigned int len;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ len = skb_ip_totlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ len = ntohs(ipv6_hdr(skb)->payload_len);
+ if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) {
+ int err = nf_ip6_check_hbh_len(skb, &len); /* may rewrite 'len' through the pointer */
+
+ if (err)
+ return err;
+ }
+ len += sizeof(struct ipv6hdr); /* payload_len does not count the fixed header */
+ break;
+ default:
+ len = skb->len; /* unknown family: target equals the current length */
+ }
+
+ return pskb_trim_rcsum(skb, len);
+}
+EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim);
+
+/* Defragment 'skb' for conntrack in the given 'zone'.
+ *
+ * Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed.
+ *
+ * NFPROTO_IPV4: the IPv4 control block is cleared and ip_defrag() is run
+ * with bottom halves disabled, using IP_DEFRAG_CONNTRACK_IN + zone as the
+ * defrag user. A non-zero result is returned without freeing 'skb' here
+ * (presumably ip_defrag() has already consumed it; confirm against
+ * ip_defrag()). On success '*mru' is set from the control block's
+ * frag_max_size; '*proto' is not written on this path.
+ *
+ * NFPROTO_IPV6 (only when CONFIG_NF_DEFRAG_IPV6 is enabled): the IPv6
+ * control block is cleared and nf_ct_frag6_gather() is run with
+ * IP6_DEFRAG_CONNTRACK_IN + zone. On an error other than -EINPROGRESS
+ * 'skb' is freed here. On success '*proto' is set to the IPv6 header's
+ * nexthdr and '*mru' to frag_max_size.
+ *
+ * Any other family: 'skb' is freed and -EPFNOSUPPORT is returned.
+ *
+ * On success the skb hash is cleared and skb->ignore_df is set.
+ */
+int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u16 zone, u8 family, u8 *proto, u16 *mru)
+{
+ int err;
+
+ if (family == NFPROTO_IPV4) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+ if (err)
+ return err; /* no kfree_skb() on this path */
+
+ *mru = IPCB(skb)->frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ } else if (family == NFPROTO_IPV6) {
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err) {
+ if (err != -EINPROGRESS) /* -EINPROGRESS means 'skb' was stolen, so it must not be freed */
+ kfree_skb(skb);
+ return err;
+ }
+
+ *proto = ipv6_hdr(skb)->nexthdr;
+ *mru = IP6CB(skb)->frag_max_size;
+#endif
+ } else {
+ kfree_skb(skb);
+ return -EPFNOSUPPORT;
+ }
+
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_handle_fragments);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index ccef340be575..bc1d96686b9c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
case IPPROTO_UDP: return &nf_conntrack_l4proto_udp;
case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp;
case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp;
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp;
#endif
@@ -284,16 +281,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
/* We only do TCP and SCTP at the moment: is there a better way? */
if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP) {
- pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ tuple.dst.protonum != IPPROTO_SCTP)
return -ENOPROTOOPT;
- }
- if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
- *len, sizeof(struct sockaddr_in));
+ if ((unsigned int)*len < sizeof(struct sockaddr_in))
return -EINVAL;
- }
h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
if (h) {
@@ -307,17 +299,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
.tuple.dst.u3.ip;
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
- pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
nf_ct_put(ct);
if (copy_to_user(user, &sin, sizeof(sin)) != 0)
return -EFAULT;
else
return 0;
}
- pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
- &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
return -ENOENT;
}
@@ -360,12 +347,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -EINVAL;
h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (!h) {
- pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
- &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+ if (!h)
return -ENOENT;
- }
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -695,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
#if IS_ENABLED(CONFIG_IPV6)
nf_conntrack_icmpv6_init_net(net);
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- nf_conntrack_dccp_init_net(net);
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
nf_conntrack_sctp_init_net(net);
#endif
@@ -713,3 +693,4 @@ MODULE_ALIAS("ip_conntrack");
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
deleted file mode 100644
index c1557d47ccd1..000000000000
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ /dev/null
@@ -1,778 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * DCCP connection tracking protocol helper
- *
- * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sysctl.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/dccp.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_conntrack_timeout.h>
-#include <net/netfilter/nf_log.h>
-
-/* Timeouts are based on values from RFC4340:
- *
- * - REQUEST:
- *
- * 8.1.2. Client Request
- *
- * A client MAY give up on its DCCP-Requests after some time
- * (3 minutes, for example).
- *
- * - RESPOND:
- *
- * 8.1.3. Server Response
- *
- * It MAY also leave the RESPOND state for CLOSED after a timeout of
- * not less than 4MSL (8 minutes);
- *
- * - PARTOPEN:
- *
- * 8.1.5. Handshake Completion
- *
- * If the client remains in PARTOPEN for more than 4MSL (8 minutes),
- * it SHOULD reset the connection with Reset Code 2, "Aborted".
- *
- * - OPEN:
- *
- * The DCCP timestamp overflows after 11.9 hours. If the connection
- * stays idle this long the sequence number won't be recognized
- * as valid anymore.
- *
- * - CLOSEREQ/CLOSING:
- *
- * 8.3. Termination
- *
- * The retransmission timer should initially be set to go off in two
- * round-trip times and should back off to not less than once every
- * 64 seconds ...
- *
- * - TIMEWAIT:
- *
- * 4.3. States
- *
- * A server or client socket remains in this state for 2MSL (4 minutes)
- * after the connection has been town down, ...
- */
-
-#define DCCP_MSL (2 * 60 * HZ)
-
-static const char * const dccp_state_names[] = {
- [CT_DCCP_NONE] = "NONE",
- [CT_DCCP_REQUEST] = "REQUEST",
- [CT_DCCP_RESPOND] = "RESPOND",
- [CT_DCCP_PARTOPEN] = "PARTOPEN",
- [CT_DCCP_OPEN] = "OPEN",
- [CT_DCCP_CLOSEREQ] = "CLOSEREQ",
- [CT_DCCP_CLOSING] = "CLOSING",
- [CT_DCCP_TIMEWAIT] = "TIMEWAIT",
- [CT_DCCP_IGNORE] = "IGNORE",
- [CT_DCCP_INVALID] = "INVALID",
-};
-
-#define sNO CT_DCCP_NONE
-#define sRQ CT_DCCP_REQUEST
-#define sRS CT_DCCP_RESPOND
-#define sPO CT_DCCP_PARTOPEN
-#define sOP CT_DCCP_OPEN
-#define sCR CT_DCCP_CLOSEREQ
-#define sCG CT_DCCP_CLOSING
-#define sTW CT_DCCP_TIMEWAIT
-#define sIG CT_DCCP_IGNORE
-#define sIV CT_DCCP_INVALID
-
-/*
- * DCCP state transition table
- *
- * The assumption is the same as for TCP tracking:
- *
- * We are the man in the middle. All the packets go through us but might
- * get lost in transit to the destination. It is assumed that the destination
- * can't receive segments we haven't seen.
- *
- * The following states exist:
- *
- * NONE: Initial state, expecting Request
- * REQUEST: Request seen, waiting for Response from server
- * RESPOND: Response from server seen, waiting for Ack from client
- * PARTOPEN: Ack after Response seen, waiting for packet other than Response,
- * Reset or Sync from server
- * OPEN: Packet other than Response, Reset or Sync seen
- * CLOSEREQ: CloseReq from server seen, expecting Close from client
- * CLOSING: Close seen, expecting Reset
- * TIMEWAIT: Reset seen
- * IGNORE: Not determinable whether packet is valid
- *
- * Some states exist only on one side of the connection: REQUEST, RESPOND,
- * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
- * the one it was in before.
- *
- * Packets are marked as ignored (sIG) if we don't know if they're valid
- * (for example a reincarnation of a connection we didn't notice is dead
- * already) and the server may send back a connection closing Reset or a
- * Response. They're also used for Sync/SyncAck packets, which we don't
- * care about.
- */
-static const u_int8_t
-dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
- [CT_DCCP_ROLE_CLIENT] = {
- [DCCP_PKT_REQUEST] = {
- /*
- * sNO -> sRQ Regular Request
- * sRQ -> sRQ Retransmitted Request or reincarnation
- * sRS -> sRS Retransmitted Request (apparently Response
- * got lost after we saw it) or reincarnation
- * sPO -> sIG Ignore, conntrack might be out of sync
- * sOP -> sIG Ignore, conntrack might be out of sync
- * sCR -> sIG Ignore, conntrack might be out of sync
- * sCG -> sIG Ignore, conntrack might be out of sync
- * sTW -> sRQ Reincarnation
- *
- * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */
- sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
- },
- [DCCP_PKT_RESPONSE] = {
- /*
- * sNO -> sIV Invalid
- * sRQ -> sIG Ignore, might be response to ignored Request
- * sRS -> sIG Ignore, might be response to ignored Request
- * sPO -> sIG Ignore, might be response to ignored Request
- * sOP -> sIG Ignore, might be response to ignored Request
- * sCR -> sIG Ignore, might be response to ignored Request
- * sCG -> sIG Ignore, might be response to ignored Request
- * sTW -> sIV Invalid, reincarnation in reverse direction
- * goes through sRQ
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
- },
- [DCCP_PKT_ACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
- * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN
- * sOP -> sOP Regular ACK, remain in OPEN
- * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
- },
- [DCCP_PKT_DATA] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.)
- * sOP -> sOP Regular Data packet
- * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
- },
- [DCCP_PKT_DATAACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
- * sPO -> sPO Remain in PARTOPEN state
- * sOP -> sOP Regular DataAck packet in OPEN state
- * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
- },
- [DCCP_PKT_CLOSEREQ] = {
- /*
- * CLOSEREQ may only be sent by the server.
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
- },
- [DCCP_PKT_CLOSE] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sCG Client-initiated close
- * sOP -> sCG Client-initiated close
- * sCR -> sCG Close in response to CloseReq (8.3.)
- * sCG -> sCG Retransmit
- * sTW -> sIV Late retransmit, already in TIME_WAIT
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV
- },
- [DCCP_PKT_RESET] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.)
- * sRS -> sTW Response received without Request
- * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.)
- * sOP -> sTW Connection reset
- * sCR -> sTW Connection reset
- * sCG -> sTW Connection reset
- * sTW -> sIG Ignore (don't refresh timer)
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
- },
- [DCCP_PKT_SYNC] = {
- /*
- * We currently ignore Sync packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- [DCCP_PKT_SYNCACK] = {
- /*
- * We currently ignore SyncAck packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- },
- [CT_DCCP_ROLE_SERVER] = {
- [DCCP_PKT_REQUEST] = {
- /*
- * sNO -> sIV Invalid
- * sRQ -> sIG Ignore, conntrack might be out of sync
- * sRS -> sIG Ignore, conntrack might be out of sync
- * sPO -> sIG Ignore, conntrack might be out of sync
- * sOP -> sIG Ignore, conntrack might be out of sync
- * sCR -> sIG Ignore, conntrack might be out of sync
- * sCG -> sIG Ignore, conntrack might be out of sync
- * sTW -> sRQ Reincarnation, must reverse roles
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
- },
- [DCCP_PKT_RESPONSE] = {
- /*
- * sNO -> sIV Response without Request
- * sRQ -> sRS Response to clients Request
- * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT)
- * sPO -> sIG Response to an ignored Request or late retransmit
- * sOP -> sIG Ignore, might be response to ignored Request
- * sCR -> sIG Ignore, might be response to ignored Request
- * sCG -> sIG Ignore, might be response to ignored Request
- * sTW -> sIV Invalid, Request from client in sTW moves to sRQ
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
- },
- [DCCP_PKT_ACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular Ack in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_DATA] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular Data packet in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_DATAACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular DataAck in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_CLOSEREQ] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.)
- * sOP -> sCR CloseReq in OPEN state
- * sCR -> sCR Retransmit
- * sCG -> sCR Simultaneous close, client sends another Close
- * sTW -> sIV Already closed
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
- },
- [DCCP_PKT_CLOSE] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP -> sCG Move direcly to CLOSING
- * sOP -> sCG Move to CLOSING
- * sCR -> sIV Close after CloseReq is invalid
- * sCG -> sCG Retransmit
- * sTW -> sIV Already closed
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
- },
- [DCCP_PKT_RESET] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sTW Reset in response to Request
- * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.)
- * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.)
- * sOP -> sTW
- * sCR -> sTW
- * sCG -> sTW
- * sTW -> sIG Ignore (don't refresh timer)
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */
- sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG
- },
- [DCCP_PKT_SYNC] = {
- /*
- * We currently ignore Sync packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- [DCCP_PKT_SYNCACK] = {
- /*
- * We currently ignore SyncAck packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- },
-};
-
-static noinline bool
-dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const struct nf_hook_state *hook_state)
-{
- struct net *net = nf_ct_net(ct);
- struct nf_dccp_net *dn;
- const char *msg;
- u_int8_t state;
-
- state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
- switch (state) {
- default:
- dn = nf_dccp_pernet(net);
- if (dn->dccp_loose == 0) {
- msg = "not picking up existing connection ";
- goto out_invalid;
- }
- break;
- case CT_DCCP_REQUEST:
- break;
- case CT_DCCP_INVALID:
- msg = "invalid state transition ";
- goto out_invalid;
- }
-
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.state = CT_DCCP_NONE;
- ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
- ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
- ct->proto.dccp.handshake_seq = 0;
- return true;
-
-out_invalid:
- nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg);
- return false;
-}
-
-static u64 dccp_ack_seq(const struct dccp_hdr *dh)
-{
- const struct dccp_hdr_ack_bits *dhack;
-
- dhack = (void *)dh + __dccp_basic_hdr_len(dh);
- return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
- ntohl(dhack->dccph_ack_nr_low);
-}
-
-static bool dccp_error(const struct dccp_hdr *dh,
- struct sk_buff *skb, unsigned int dataoff,
- const struct nf_hook_state *state)
-{
- unsigned int dccp_len = skb->len - dataoff;
- unsigned int cscov;
- const char *msg;
-
- if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
- dh->dccph_doff * 4 > dccp_len) {
- msg = "nf_ct_dccp: truncated/malformed packet ";
- goto out_invalid;
- }
-
- cscov = dccp_len;
- if (dh->dccph_cscov) {
- cscov = (dh->dccph_cscov - 1) * 4;
- if (cscov > dccp_len) {
- msg = "nf_ct_dccp: bad checksum coverage ";
- goto out_invalid;
- }
- }
-
- if (state->hook == NF_INET_PRE_ROUTING &&
- state->net->ct.sysctl_checksum &&
- nf_checksum_partial(skb, state->hook, dataoff, cscov,
- IPPROTO_DCCP, state->pf)) {
- msg = "nf_ct_dccp: bad checksum ";
- goto out_invalid;
- }
-
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- msg = "nf_ct_dccp: reserved packet type ";
- goto out_invalid;
- }
- return false;
-out_invalid:
- nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg);
- return true;
-}
-
-int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- struct dccp_hdr _dh, *dh;
- u_int8_t type, old_state, new_state;
- enum ct_dccp_roles role;
- unsigned int *timeouts;
-
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
- if (!dh)
- return NF_DROP;
-
- if (dccp_error(dh, skb, dataoff, state))
- return -NF_ACCEPT;
-
- type = dh->dccph_type;
- if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state))
- return -NF_ACCEPT;
-
- if (type == DCCP_PKT_RESET &&
- !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- /* Tear down connection immediately if only reply is a RESET */
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_ACCEPT;
- }
-
- spin_lock_bh(&ct->lock);
-
- role = ct->proto.dccp.role[dir];
- old_state = ct->proto.dccp.state;
- new_state = dccp_state_table[role][type][old_state];
-
- switch (new_state) {
- case CT_DCCP_REQUEST:
- if (old_state == CT_DCCP_TIMEWAIT &&
- role == CT_DCCP_ROLE_SERVER) {
- /* Reincarnation in the reverse direction: reopen and
- * reverse client/server roles. */
- ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
- }
- break;
- case CT_DCCP_RESPOND:
- if (old_state == CT_DCCP_REQUEST)
- ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
- break;
- case CT_DCCP_PARTOPEN:
- if (old_state == CT_DCCP_RESPOND &&
- type == DCCP_PKT_ACK &&
- dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
- set_bit(IPS_ASSURED_BIT, &ct->status);
- break;
- case CT_DCCP_IGNORE:
- /*
- * Connection tracking might be out of sync, so we ignore
- * packets that might establish a new connection and resync
- * if the server responds with a valid Response.
- */
- if (ct->proto.dccp.last_dir == !dir &&
- ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
- type == DCCP_PKT_RESPONSE) {
- ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
- new_state = CT_DCCP_RESPOND;
- break;
- }
- ct->proto.dccp.last_dir = dir;
- ct->proto.dccp.last_pkt = type;
-
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet");
- return NF_ACCEPT;
- case CT_DCCP_INVALID:
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition");
- return -NF_ACCEPT;
- }
-
- ct->proto.dccp.last_dir = dir;
- ct->proto.dccp.last_pkt = type;
- ct->proto.dccp.state = new_state;
- spin_unlock_bh(&ct->lock);
-
- if (new_state != old_state)
- nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
-
- timeouts = nf_ct_timeout_lookup(ct);
- if (!timeouts)
- timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout;
- nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
-
- return NF_ACCEPT;
-}
-
-static bool dccp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.dccp.state) {
- case CT_DCCP_CLOSEREQ:
- case CT_DCCP_CLOSING:
- case CT_DCCP_TIMEWAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
-{
- seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct, bool destroy)
-{
- struct nlattr *nest_parms;
-
- spin_lock_bh(&ct->lock);
- nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP);
- if (!nest_parms)
- goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state))
- goto nla_put_failure;
-
- if (destroy)
- goto skip_state;
-
- if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
- nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
- cpu_to_be64(ct->proto.dccp.handshake_seq),
- CTA_PROTOINFO_DCCP_PAD))
- goto nla_put_failure;
-skip_state:
- nla_nest_end(skb, nest_parms);
- spin_unlock_bh(&ct->lock);
-
- return 0;
-
-nla_put_failure:
- spin_unlock_bh(&ct->lock);
- return -1;
-}
-
-static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
- [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
- [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
- [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
- [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC },
-};
-
-#define DCCP_NLATTR_SIZE ( \
- NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \
- NLA_ALIGN(NLA_HDRLEN + 0))
-
-static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
-{
- struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
- struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
- int err;
-
- if (!attr)
- return 0;
-
- err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr,
- dccp_nla_policy, NULL);
- if (err < 0)
- return err;
-
- if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
- !tb[CTA_PROTOINFO_DCCP_ROLE] ||
- nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
- nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
- return -EINVAL;
- }
-
- spin_lock_bh(&ct->lock);
- ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
- if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
- } else {
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
- }
- if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) {
- ct->proto.dccp.handshake_seq =
- be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
- }
- spin_unlock_bh(&ct->lock);
- return 0;
-}
-#endif
-
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
- struct net *net, void *data)
-{
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
- unsigned int *timeouts = data;
- int i;
-
- if (!timeouts)
- timeouts = dn->dccp_timeout;
-
- /* set default DCCP timeouts. */
- for (i=0; i<CT_DCCP_MAX; i++)
- timeouts[i] = dn->dccp_timeout[i];
-
- /* there's a 1:1 mapping between attributes and protocol states. */
- for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
- if (tb[i]) {
- timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
- }
- }
-
- timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST];
- return 0;
-}
-
-static int
-dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
- const unsigned int *timeouts = data;
- int i;
-
- for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
- if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
- goto nla_put_failure;
- }
- return 0;
-
-nla_put_failure:
- return -ENOSPC;
-}
-
-static const struct nla_policy
-dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
- [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-
-void nf_conntrack_dccp_init_net(struct net *net)
-{
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
-
- /* default values */
- dn->dccp_loose = 1;
- dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
-
- /* timeouts[0] is unused, make it same as SYN_SENT so
- * ->timeouts[0] contains 'new' timeout, like udp or icmp.
- */
- dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST];
-}
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
- .l4proto = IPPROTO_DCCP,
- .can_early_drop = dccp_can_early_drop,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
- .print_conntrack = dccp_print_conntrack,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_size = DCCP_NLATTR_SIZE,
- .to_nlattr = dccp_to_nlattr,
- .from_nlattr = nlattr_to_dccp,
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- .ctnl_timeout = {
- .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
- .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
- .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
- .nla_policy = dccp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 728eeb0aea87..af369e686fc5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -205,6 +205,8 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state)
{
+ unsigned long status;
+
if (!nf_ct_is_confirmed(ct)) {
unsigned int *timeouts = nf_ct_timeout_lookup(ct);
@@ -217,11 +219,17 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
}
+ status = READ_ONCE(ct->status);
/* If we've seen traffic both ways, this is a GRE connection.
* Extend timeout. */
- if (ct->status & IPS_SEEN_REPLY) {
+ if (status & IPS_SEEN_REPLY) {
nf_ct_refresh_acct(ct, ctinfo, skb,
ct->proto.gre.stream_timeout);
+
+ /* never set ASSURED for IPS_NAT_CLASH, they time out soon */
+ if (unlikely((status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe. */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
@@ -296,6 +304,7 @@ void nf_conntrack_gre_init_net(struct net *net)
/* protocol helper struct */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = {
.l4proto = IPPROTO_GRE,
+ .allow_clash = true,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = gre_print_conntrack,
#endif
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 1020d67600a9..327b8059025d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = {
[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
[NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
- [ICMPV6_MLD2_REPORT - 130] = 1
+ [ICMPV6_MLD2_REPORT - 130] = 1,
+ [ICMPV6_MRDISC_ADV - 130] = 1,
+ [ICMPV6_MRDISC_SOL - 130] = 1
};
bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 945dd40e7077..7c6f7c9f7332 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = {
[SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT",
};
-#define SECS * HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS * 24 HOURS
-
static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
- [SCTP_CONNTRACK_CLOSED] = 10 SECS,
- [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
- [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
- [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS,
- [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
- [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
- [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
+ [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10),
+ [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210),
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30),
};
#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
@@ -105,14 +100,14 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
-/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW},
+/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW},
/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},
/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL},
/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
+/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
@@ -126,7 +121,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
+/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
@@ -142,17 +137,19 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
}
#endif
+/* do_basic_checks ensures sch->length > 0, do not use before */
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \
- ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))) && \
- (sch)->length; \
+ (offset) < (skb)->len && \
+ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \
(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
/* Some validity checks to make sure the chunks are fine */
static int do_basic_checks(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- unsigned long *map)
+ unsigned long *map,
+ const struct nf_hook_state *state)
{
u_int32_t offset, count;
struct sctp_chunkhdr _sch, *sch;
@@ -161,8 +158,6 @@ static int do_basic_checks(struct nf_conn *ct,
flag = 0;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
- pr_debug("Chunk Num: %d Type: %d\n", count, sch->type);
-
if (sch->type == SCTP_CID_INIT ||
sch->type == SCTP_CID_INIT_ACK ||
sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
@@ -177,7 +172,9 @@ static int do_basic_checks(struct nf_conn *ct,
sch->type == SCTP_CID_COOKIE_ECHO ||
flag) &&
count != 0) || !sch->length) {
- pr_debug("Basic checks failed\n");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "%s failed. chunk num %d, type %d, len %d flag %d\n",
+ __func__, count, sch->type, sch->length, flag);
return 1;
}
@@ -185,7 +182,6 @@ static int do_basic_checks(struct nf_conn *ct,
set_bit(sch->type, map);
}
- pr_debug("Basic checks passed\n");
return count == 0;
}
@@ -195,64 +191,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
{
int i;
- pr_debug("Chunk type: %d\n", chunk_type);
-
switch (chunk_type) {
case SCTP_CID_INIT:
- pr_debug("SCTP_CID_INIT\n");
i = 0;
break;
case SCTP_CID_INIT_ACK:
- pr_debug("SCTP_CID_INIT_ACK\n");
i = 1;
break;
case SCTP_CID_ABORT:
- pr_debug("SCTP_CID_ABORT\n");
i = 2;
break;
case SCTP_CID_SHUTDOWN:
- pr_debug("SCTP_CID_SHUTDOWN\n");
i = 3;
break;
case SCTP_CID_SHUTDOWN_ACK:
- pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
i = 4;
break;
case SCTP_CID_ERROR:
- pr_debug("SCTP_CID_ERROR\n");
i = 5;
break;
case SCTP_CID_COOKIE_ECHO:
- pr_debug("SCTP_CID_COOKIE_ECHO\n");
i = 6;
break;
case SCTP_CID_COOKIE_ACK:
- pr_debug("SCTP_CID_COOKIE_ACK\n");
i = 7;
break;
case SCTP_CID_SHUTDOWN_COMPLETE:
- pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
i = 8;
break;
case SCTP_CID_HEARTBEAT:
- pr_debug("SCTP_CID_HEARTBEAT");
i = 9;
break;
case SCTP_CID_HEARTBEAT_ACK:
- pr_debug("SCTP_CID_HEARTBEAT_ACK");
i = 10;
break;
default:
/* Other chunks like DATA or SACK do not change the state */
- pr_debug("Unknown chunk type, Will stay in %s\n",
- sctp_conntrack_names[cur_state]);
+ pr_debug("Unknown chunk type %d, Will stay in %s\n",
+ chunk_type, sctp_conntrack_names[cur_state]);
return cur_state;
}
- pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
- dir, sctp_conntrack_names[cur_state], chunk_type,
- sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
-
return sctp_conntracks[dir][i][cur_state];
}
@@ -299,7 +278,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
pr_debug("Setting vtag %x for secondary conntrack\n",
sh->vtag);
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
- } else {
+ } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) {
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
pr_debug("Setting vtag %x for new conn OOTB\n",
@@ -369,7 +348,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
if (sh == NULL)
goto out;
- if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ if (do_basic_checks(ct, skb, dataoff, map, state) != 0)
goto out;
if (!nf_ct_is_confirmed(ct)) {
@@ -392,7 +371,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
!test_bit(SCTP_CID_HEARTBEAT, map) &&
!test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "verification tag check failed %x vs %x for dir %d",
+ sh->vtag, ct->proto.sctp.vtag[dir], dir);
goto out;
}
@@ -426,6 +407,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
/* (D) vtag must be same as init_vtag as found in INIT_ACK */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
+ } else if (sch->type == SCTP_CID_COOKIE_ACK) {
+ ct->proto.sctp.init[dir] = 0;
+ ct->proto.sctp.init[!dir] = 0;
} else if (sch->type == SCTP_CID_HEARTBEAT) {
if (ct->proto.sctp.vtag[dir] == 0) {
pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
@@ -467,23 +451,26 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
/* Invalid */
if (new_state == SCTP_CONNTRACK_MAX) {
- pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
- "conntrack=%u\n",
- dir, sch->type, old_state);
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "Invalid, old_state %d, dir %d, type %d",
+ old_state, dir, sch->type);
+
goto out_unlock;
}
/* If it is an INIT or an INIT ACK note down the vtag */
- if (sch->type == SCTP_CID_INIT ||
- sch->type == SCTP_CID_INIT_ACK) {
- struct sctp_inithdr _inithdr, *ih;
+ if (sch->type == SCTP_CID_INIT) {
+ struct sctp_inithdr _ih, *ih;
- ih = skb_header_pointer(skb, offset + sizeof(_sch),
- sizeof(_inithdr), &_inithdr);
- if (ih == NULL)
+ ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+ if (!ih)
goto out_unlock;
- pr_debug("Setting vtag %x for dir %d\n",
- ih->init_tag, !dir);
+
+ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir])
+ ct->proto.sctp.init[!dir] = 0;
+ ct->proto.sctp.init[dir] = 1;
+
+ pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
ct->proto.sctp.vtag[!dir] = ih->init_tag;
/* don't renew timeout on init retransmit so
@@ -494,6 +481,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
old_state == SCTP_CONNTRACK_CLOSED &&
nf_ct_is_confirmed(ct))
ignore = true;
+ } else if (sch->type == SCTP_CID_INIT_ACK) {
+ struct sctp_inithdr _ih, *ih;
+ __be32 vtag;
+
+ ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+ if (!ih)
+ goto out_unlock;
+
+ vtag = ct->proto.sctp.vtag[!dir];
+ if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag)
+ goto out_unlock;
+ /* collision */
+ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
+ vtag != ih->init_tag)
+ goto out_unlock;
+
+ pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
+ ct->proto.sctp.vtag[!dir] = ih->init_tag;
}
ct->proto.sctp.state = new_state;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3ac1af6f59fc..0c1d086e96cb 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -14,7 +14,7 @@
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/tcp.h>
@@ -457,7 +457,8 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender,
const struct sk_buff *skb,
unsigned int dataoff,
const struct tcphdr *tcph,
- u32 end, u32 win)
+ u32 end, u32 win,
+ enum ip_conntrack_dir dir)
{
/* SYN-ACK in reply to a SYN
* or SYN from reply direction in simultaneous open.
@@ -471,7 +472,8 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender,
* Both sides must send the Window Scale option
* to enable window scaling in either direction.
*/
- if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ if (dir == IP_CT_DIR_REPLY &&
+ !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
sender->td_scale = 0;
receiver->td_scale = 0;
@@ -542,7 +544,7 @@ tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
if (tcph->syn) {
tcp_init_sender(sender, receiver,
skb, dataoff, tcph,
- end, win);
+ end, win, dir);
if (!tcph->ack)
/* Simultaneous open */
return NFCT_TCP_ACCEPT;
@@ -585,7 +587,7 @@ tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
*/
tcp_init_sender(sender, receiver,
skb, dataoff, tcph,
- end, win);
+ end, win, dir);
if (dir == IP_CT_DIR_REPLY && !tcph->ack)
return NFCT_TCP_ACCEPT;
@@ -835,7 +837,8 @@ static bool tcp_error(const struct tcphdr *th,
static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff,
- const struct tcphdr *th)
+ const struct tcphdr *th,
+ const struct nf_hook_state *state)
{
enum tcp_conntrack new_state;
struct net *net = nf_ct_net(ct);
@@ -846,7 +849,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
- pr_debug("nf_ct_tcp: invalid new deleting.\n");
+ tcp_error_log(skb, state, "invalid new");
return false;
}
@@ -911,6 +914,41 @@ static bool tcp_can_early_drop(const struct nf_conn *ct)
return false;
}
+void nf_conntrack_tcp_set_closing(struct nf_conn *ct)
+{
+ enum tcp_conntrack old_state;
+ const unsigned int *timeouts;
+ u32 timeout;
+
+ if (!nf_ct_is_confirmed(ct))
+ return;
+
+ spin_lock_bh(&ct->lock);
+ old_state = ct->proto.tcp.state;
+ ct->proto.tcp.state = TCP_CONNTRACK_CLOSE;
+
+ if (old_state == TCP_CONNTRACK_CLOSE ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+ spin_unlock_bh(&ct->lock);
+ return;
+ }
+
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts) {
+ const struct nf_tcp_net *tn;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = tn->timeouts;
+ }
+
+ timeout = timeouts[TCP_CONNTRACK_CLOSE];
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+
+ spin_unlock_bh(&ct->lock);
+
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+}
+
static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
{
state->td_end = 0;
@@ -930,7 +968,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
{
struct net *net = nf_ct_net(ct);
struct nf_tcp_net *tn = nf_tcp_pernet(net);
- struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
unsigned int index, *timeouts;
enum nf_ct_tcp_action res;
@@ -946,7 +983,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
if (tcp_error(th, skb, dataoff, state))
return -NF_ACCEPT;
- if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
+ if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state))
return -NF_ACCEPT;
spin_lock_bh(&ct->lock);
@@ -954,7 +991,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
new_state = tcp_conntracks[dir][index][old_state];
- tuple = &ct->tuplehash[dir].tuple;
switch (new_state) {
case TCP_CONNTRACK_SYN_SENT:
@@ -1232,13 +1268,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_index = index;
ct->proto.tcp.last_dir = dir;
- pr_debug("tcp_conntracks: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
- (th->syn ? 1 : 0), (th->ack ? 1 : 0),
- (th->fin ? 1 : 0), (th->rst ? 1 : 0),
- old_state, new_state);
-
ct->proto.tcp.state = new_state;
if (old_state != new_state
&& new_state == TCP_CONNTRACK_FIN_WAIT)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3b516cffc779..0030fbe8885c 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -88,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
const struct nf_hook_state *state)
{
unsigned int *timeouts;
+ unsigned long status;
if (udp_error(skb, dataoff, state))
return -NF_ACCEPT;
@@ -96,26 +97,27 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
if (!timeouts)
timeouts = udp_get_timeouts(nf_ct_net(ct));
- if (!nf_ct_is_confirmed(ct))
+ status = READ_ONCE(ct->status);
+ if ((status & IPS_CONFIRMED) == 0)
ct->proto.udp.stream_ts = 2 * HZ + jiffies;
/* If we've seen traffic both ways, this is some kind of UDP
* stream. Set Assured.
*/
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ if (status & IPS_SEEN_REPLY) {
unsigned long extra = timeouts[UDP_CT_UNREPLIED];
bool stream = false;
/* Still active after two seconds? Extend timeout. */
if (time_after(jiffies, ct->proto.udp.stream_ts)) {
extra = timeouts[UDP_CT_REPLIED];
- stream = true;
+ stream = (status & IPS_ASSURED) == 0;
}
nf_ct_refresh_acct(ct, ctinfo, skb, extra);
/* never set ASSURED for IPS_NAT_CLASH, they time out soon */
- if (unlikely((ct->status & IPS_NAT_CLASH)))
+ if (unlikely((status & IPS_NAT_CLASH)))
return NF_ACCEPT;
/* Also, more likely to be important, and not a probe */
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 77f5e82d8e3f..ca748f8dbff1 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -611,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr,
start += strlen(name);
*val = simple_strtoul(start, &end, 0);
if (start == end)
- return 0;
+ return -1;
if (matchoff && matchlen) {
*matchoff = start - dptr;
*matchlen = end - start;
@@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
@@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 460294bd4b60..207b240b14e5 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -14,6 +14,7 @@
#include <linux/sysctl.h>
#endif
+#include <net/netfilter/nf_log.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
@@ -22,9 +23,6 @@
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
-#ifdef CONFIG_LWTUNNEL
-#include <net/netfilter/nf_hooks_lwtunnel.h>
-#endif
#include <linux/rculist_nulls.h>
static bool enable_hooks __read_mostly;
@@ -70,11 +68,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
ntohs(tuple->dst.u.udp.port));
break;
- case IPPROTO_DCCP:
- seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.dccp.port),
- ntohs(tuple->dst.u.dccp.port));
- break;
case IPPROTO_SCTP:
seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.sctp.port),
@@ -101,69 +94,87 @@ struct ct_iter_state {
struct seq_net_private p;
struct hlist_nulls_head *hash;
unsigned int htable_size;
+ unsigned int skip_elems;
unsigned int bucket;
u_int64_t time_now;
};
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net,
+ struct ct_iter_state *st)
{
- struct ct_iter_state *st = seq->private;
+ struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int i;
- for (st->bucket = 0;
- st->bucket < st->htable_size;
- st->bucket++) {
- n = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- if (!is_a_nulls(n))
- return n;
- }
- return NULL;
-}
+ for (i = st->bucket; i < st->htable_size; i++) {
+ unsigned int skip = 0;
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
- struct hlist_nulls_node *head)
-{
- struct ct_iter_state *st = seq->private;
+restart:
+ hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct hlist_nulls_node *tmp = n;
+
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
+ if (++skip <= st->skip_elems)
+ continue;
+
+ /* h should be returned, skip to nulls marker. */
+ while (!is_a_nulls(tmp))
+ tmp = rcu_dereference(hlist_nulls_next_rcu(tmp));
+
+ /* check if h is still linked to hash[i] */
+ if (get_nulls_value(tmp) != i) {
+ skip = 0;
+ goto restart;
+ }
- head = rcu_dereference(hlist_nulls_next_rcu(head));
- while (is_a_nulls(head)) {
- if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= st->htable_size)
- return NULL;
+ st->skip_elems = skip;
+ st->bucket = i;
+ return h;
}
- head = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- }
- return head;
-}
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_nulls_node *head = ct_get_first(seq);
+ skip = 0;
+ if (get_nulls_value(n) != i)
+ goto restart;
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
+ st->skip_elems = 0;
+ }
+
+ st->bucket = i;
+ return NULL;
}
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
struct ct_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
st->time_now = ktime_get_real_ns();
rcu_read_lock();
nf_conntrack_get_ht(&st->hash, &st->htable_size);
- return ct_get_idx(seq, *pos);
+
+ if (*pos == 0) {
+ st->skip_elems = 0;
+ st->bucket = 0;
+ } else if (st->skip_elems) {
+ /* resume from last dumped entry */
+ st->skip_elems--;
+ }
+
+ return ct_get_next(net, st);
}
static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct ct_iter_state *st = s->private;
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- return ct_get_next(s, v);
+ return ct_get_next(net, st);
}
static void ct_seq_stop(struct seq_file *s, void *v)
@@ -175,17 +186,16 @@ static void ct_seq_stop(struct seq_file *s, void *v)
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
+ struct lsm_context ctx;
int ret;
- u32 len;
- char *secctx;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return;
- seq_printf(s, "secctx=%s ", secctx);
+ seq_printf(s, "secctx=%s ", ctx.context);
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
}
#else
static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
@@ -265,7 +275,6 @@ static const char* l4proto_name(u16 proto)
case IPPROTO_ICMP: return "icmp";
case IPPROTO_TCP: return "tcp";
case IPPROTO_UDP: return "udp";
- case IPPROTO_DCCP: return "dccp";
case IPPROTO_GRE: return "gre";
case IPPROTO_SCTP: return "sctp";
case IPPROTO_UDPLITE: return "udplite";
@@ -275,7 +284,7 @@ static const char* l4proto_name(u16 proto)
return "unknown";
}
-static unsigned int
+static void
seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
{
struct nf_conn_acct *acct;
@@ -283,14 +292,12 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
acct = nf_conn_acct_find(ct);
if (!acct)
- return 0;
+ return;
counter = acct->counter;
seq_printf(s, "packets=%llu bytes=%llu ",
(unsigned long long)atomic64_read(&counter[dir].packets),
(unsigned long long)atomic64_read(&counter[dir].bytes));
-
- return 0;
}
/* return 0 on success, 1 in case of error */
@@ -310,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
smp_acquire__after_ctrl_dep();
if (nf_ct_should_gc(ct)) {
+ struct ct_iter_state *st = s->private;
+
+ st->skip_elems--;
nf_ct_kill(ct);
goto release;
}
@@ -342,8 +352,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (seq_has_overflowed(s))
goto release;
- if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
- goto release;
+ seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL);
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
seq_puts(s, "[UNREPLIED] ");
@@ -352,8 +361,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
- if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
- goto release;
+ seq_print_acct(s, ct, IP_CT_DIR_REPLY);
if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status))
seq_puts(s, "[HW_OFFLOAD] ");
@@ -531,7 +539,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_count);
static unsigned int nf_conntrack_htable_size_user __read_mostly;
static int
-nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
+nf_conntrack_hash_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -551,6 +559,29 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
return ret;
}
+static int
+nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, i;
+
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (ret < 0 || !write)
+ return ret;
+
+ if (*(u8 *)table->data == 0)
+ return 0;
+
+ /* Load nf_log_syslog only if no logger is currently registered */
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ if (nf_log_is_registered(i))
+ return 0;
+ }
+ request_module("%s", "nf_log_syslog");
+
+ return 0;
+}
+
static struct ctl_table_header *nf_ct_netfilter_header;
enum nf_ct_sysctl_index {
@@ -602,36 +633,23 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT,
- NF_SYSCTL_CT_PROTO_DCCP_LOOSE,
-#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE,
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
#endif
-#ifdef CONFIG_LWTUNNEL
- NF_SYSCTL_CT_LWTUNNEL,
-#endif
- __NF_SYSCTL_CT_LAST_SYSCTL,
+ NF_SYSCTL_CT_LAST_SYSCTL,
};
-#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1)
-
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
[NF_SYSCTL_CT_COUNT] = {
.procname = "nf_conntrack_count",
@@ -660,14 +678,16 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.data = &init_net.ct.sysctl_log_invalid,
.maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
+ .proc_handler = nf_conntrack_log_invalid_sysctl,
},
[NF_SYSCTL_CT_EXPECT_MAX] = {
.procname = "nf_conntrack_expect_max",
.data = &nf_ct_expect_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
[NF_SYSCTL_CT_ACCT] = {
.procname = "nf_conntrack_acct",
@@ -886,58 +906,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
- .procname = "nf_conntrack_dccp_timeout_request",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = {
- .procname = "nf_conntrack_dccp_timeout_respond",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = {
- .procname = "nf_conntrack_dccp_timeout_partopen",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = {
- .procname = "nf_conntrack_dccp_timeout_open",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = {
- .procname = "nf_conntrack_dccp_timeout_closereq",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = {
- .procname = "nf_conntrack_dccp_timeout_closing",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = {
- .procname = "nf_conntrack_dccp_timeout_timewait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = {
- .procname = "nf_conntrack_dccp_loose",
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = {
.procname = "nf_conntrack_gre_timeout",
@@ -952,16 +920,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
-#ifdef CONFIG_LWTUNNEL
- [NF_SYSCTL_CT_LWTUNNEL] = {
- .procname = "nf_hooks_lwtunnel",
- .data = NULL,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = nf_hooks_lwtunnel_sysctl_handler,
- },
-#endif
- {}
};
static struct ctl_table nf_ct_netfilter_table[] = {
@@ -970,9 +928,10 @@ static struct ctl_table nf_ct_netfilter_table[] = {
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
- { }
};
static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
@@ -1032,29 +991,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
#endif
}
-static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net,
- struct ctl_table *table)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
-
-#define XASSIGN(XNAME, dn) \
- table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \
- &(dn)->dccp_timeout[CT_DCCP_ ## XNAME]
-
- XASSIGN(REQUEST, dn);
- XASSIGN(RESPOND, dn);
- XASSIGN(PARTOPEN, dn);
- XASSIGN(OPEN, dn);
- XASSIGN(CLOSEREQ, dn);
- XASSIGN(CLOSING, dn);
- XASSIGN(TIMEWAIT, dn);
-#undef XASSIGN
-
- table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose;
-#endif
-}
-
static void nf_conntrack_standalone_init_gre_sysctl(struct net *net,
struct ctl_table *table)
{
@@ -1100,7 +1036,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
nf_conntrack_standalone_init_tcp_sysctl(net, table);
nf_conntrack_standalone_init_sctp_sysctl(net, table);
- nf_conntrack_standalone_init_dccp_sysctl(net, table);
nf_conntrack_standalone_init_gre_sysctl(net, table);
/* Don't allow non-init_net ns to alter global sysctls */
@@ -1110,7 +1045,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
- cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table);
+ cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter",
+ table,
+ ARRAY_SIZE(nf_ct_sysctl_table));
if (!cnet->sysctl_header)
goto out_unregister_netfilter;
@@ -1124,7 +1061,7 @@ out_unregister_netfilter:
static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- struct ctl_table *table;
+ const struct ctl_table *table;
table = cnet->sysctl_header->ctl_table_arg;
unregister_net_sysctl_table(cnet->sysctl_header);
@@ -1222,11 +1159,12 @@ static int __init nf_conntrack_standalone_init(void)
nf_conntrack_htable_size_user = nf_conntrack_htable_size;
#endif
+ nf_conntrack_init_end();
+
ret = register_pernet_subsys(&nf_conntrack_net_ops);
if (ret < 0)
goto out_pernet;
- nf_conntrack_init_end();
return 0;
out_pernet:
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index a8e2425e43b0..fab8b9011098 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -15,12 +15,26 @@
#define NF_RECURSION_LIMIT 2
-static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+#ifndef CONFIG_PREEMPT_RT
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+}
+#else
+
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return &current->net_xmit.nf_dup_skb_recursion;
+}
+
+#endif
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
- if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
+
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
goto err;
if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
@@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb_clear_tstamp(skb);
- __this_cpu_inc(nf_dup_skb_recursion);
+ (*nf_dup_skb_recursion)++;
dev_queue_xmit(skb);
- __this_cpu_dec(nf_dup_skb_recursion);
+ (*nf_dup_skb_recursion)--;
return;
err:
kfree_skb(skb);
diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c
new file mode 100644
index 000000000000..4a5f5195f2d2
--- /dev/null
+++ b/net/netfilter/nf_flow_table_bpf.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Flow Table Helpers for XDP hook
+ *
+ * These are called from the XDP programs.
+ * Note that it is allowed to break compatibility for these functions since
+ * the interface they are exposed through to BPF programs is explicitly
+ * unstable.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <net/xdp.h>
+
+/* bpf_flowtable_opts - options for bpf flowtable helpers
+ * @error: out parameter, set for any encountered error
+ */
+struct bpf_flowtable_opts {
+ s32 error;
+};
+
+enum {
+ NF_BPF_FLOWTABLE_OPTS_SZ = 4,
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_flow_table BTF");
+
+__bpf_kfunc_start_defs();
+
+static struct flow_offload_tuple_rhash *
+bpf_xdp_flow_tuple_lookup(struct net_device *dev,
+ struct flow_offload_tuple *tuple, __be16 proto)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *nf_flow_table;
+ struct flow_offload *nf_flow;
+
+ nf_flow_table = nf_flowtable_by_dev(dev);
+ if (!nf_flow_table)
+ return ERR_PTR(-ENOENT);
+
+ tuplehash = flow_offload_lookup(nf_flow_table, tuple);
+ if (!tuplehash)
+ return ERR_PTR(-ENOENT);
+
+ nf_flow = container_of(tuplehash, struct flow_offload,
+ tuplehash[tuplehash->tuple.dir]);
+ flow_offload_refresh(nf_flow_table, nf_flow, false);
+
+ return tuplehash;
+}
+
+__bpf_kfunc struct flow_offload_tuple_rhash *
+bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple,
+ struct bpf_flowtable_opts *opts, u32 opts_len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+ struct flow_offload_tuple tuple = {
+ .iifidx = fib_tuple->ifindex,
+ .l3proto = fib_tuple->family,
+ .l4proto = fib_tuple->l4_protocol,
+ .src_port = fib_tuple->sport,
+ .dst_port = fib_tuple->dport,
+ };
+ struct flow_offload_tuple_rhash *tuplehash;
+ __be16 proto;
+
+ if (opts_len != NF_BPF_FLOWTABLE_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ switch (fib_tuple->family) {
+ case AF_INET:
+ tuple.src_v4.s_addr = fib_tuple->ipv4_src;
+ tuple.dst_v4.s_addr = fib_tuple->ipv4_dst;
+ proto = htons(ETH_P_IP);
+ break;
+ case AF_INET6:
+ tuple.src_v6 = *(struct in6_addr *)&fib_tuple->ipv6_src;
+ tuple.dst_v6 = *(struct in6_addr *)&fib_tuple->ipv6_dst;
+ proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ opts->error = -EAFNOSUPPORT;
+ return NULL;
+ }
+
+ tuplehash = bpf_xdp_flow_tuple_lookup(xdp->rxq->dev, &tuple, proto);
+ if (IS_ERR(tuplehash)) {
+ opts->error = PTR_ERR(tuplehash);
+ return NULL;
+ }
+
+ return tuplehash;
+}
+
+__diag_pop()
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(nf_ft_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_KFUNCS_END(nf_ft_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_flow_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ft_kfunc_set,
+};
+
+int nf_flow_register_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &nf_flow_kfunc_set);
+}
+EXPORT_SYMBOL_GPL(nf_flow_register_bpf);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 81c26a96c30b..06e8251a6644 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -77,22 +77,28 @@ EXPORT_SYMBOL_GPL(flow_offload_alloc);
static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
{
- const struct rt6_info *rt;
-
- if (flow_tuple->l3proto == NFPROTO_IPV6) {
- rt = (const struct rt6_info *)flow_tuple->dst_cache;
- return rt6_get_cookie(rt);
- }
+ if (flow_tuple->l3proto == NFPROTO_IPV6)
+ return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
return 0;
}
+static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ route->tuple[dir].dst = NULL;
+
+ return dst;
+}
+
static int flow_offload_fill_route(struct flow_offload *flow,
- const struct nf_flow_route *route,
+ struct nf_flow_route *route,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
- struct dst_entry *dst = route->tuple[dir].dst;
+ struct dst_entry *dst = nft_route_dst_fetch(route, dir);
int i, j = 0;
switch (flow_tuple->l3proto) {
@@ -112,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow,
flow_tuple->in_vlan_ingress |= BIT(j);
j++;
}
+
+ flow_tuple->tun = route->tuple[dir].in.tun;
flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+ flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
switch (route->tuple[dir].xmit_type) {
case FLOW_OFFLOAD_XMIT_DIRECT:
@@ -121,13 +130,11 @@ static int flow_offload_fill_route(struct flow_offload *flow,
memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
ETH_ALEN);
flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
- flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
+ dst_release(dst);
break;
case FLOW_OFFLOAD_XMIT_XFRM:
case FLOW_OFFLOAD_XMIT_NEIGH:
- if (!dst_hold_safe(route->tuple[dir].dst))
- return -1;
-
+ flow_tuple->ifidx = route->tuple[dir].out.ifindex;
flow_tuple->dst_cache = dst;
flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
break;
@@ -148,63 +155,95 @@ static void nft_flow_dst_release(struct flow_offload *flow,
dst_release(flow->tuplehash[dir].tuple.dst_cache);
}
-int flow_offload_route_init(struct flow_offload *flow,
- const struct nf_flow_route *route)
+void flow_offload_route_init(struct flow_offload *flow,
+ struct nf_flow_route *route)
{
- int err;
-
- err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
- if (err < 0)
- return err;
-
- err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
- if (err < 0)
- goto err_route_reply;
-
+ flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
flow->type = NF_FLOW_OFFLOAD_ROUTE;
-
- return 0;
-
-err_route_reply:
- nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
-
- return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+}
+
+static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
+{
+ struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+ spin_lock_bh(&ct->lock);
+ if (tcp->state != tcp_state)
+ tcp->state = tcp_state;
+
+ /* syn packet triggers the TCP reopen case from conntrack. */
+ if (tcp->state == TCP_CONNTRACK_CLOSE)
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ /* Conntrack state is outdated due to offload bypass.
+ * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntrack's
+ * TCP reset validation will fail.
+ */
tcp->seen[0].td_maxwin = 0;
+ tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
tcp->seen[1].td_maxwin = 0;
+ tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
+ spin_unlock_bh(&ct->lock);
}
-static void flow_offload_fixup_ct(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct flow_offload *flow)
{
+ struct nf_conn *ct = flow->ct;
struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
+ bool expired, closing = false;
+ u32 offload_timeout = 0;
s32 timeout;
if (l4num == IPPROTO_TCP) {
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
-
- flow_offload_fixup_tcp(&ct->proto.tcp);
+ const struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ u8 tcp_state;
+
+ /* Enter CLOSE state if fin/rst packet has been seen, this
+ * allows TCP reopen from conntrack. Otherwise, pick up from
+ * the last seen TCP state.
+ */
+ closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
+ if (closing) {
+ flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
+ timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
+ expired = false;
+ } else {
+ tcp_state = READ_ONCE(ct->proto.tcp.state);
+ flow_offload_fixup_tcp(ct, tcp_state);
+ timeout = READ_ONCE(tn->timeouts[tcp_state]);
+ expired = nf_flow_has_expired(flow);
+ }
+ offload_timeout = READ_ONCE(tn->offload_timeout);
- timeout = tn->timeouts[ct->proto.tcp.state];
- timeout -= tn->offload_timeout;
} else if (l4num == IPPROTO_UDP) {
- struct nf_udp_net *tn = nf_udp_pernet(net);
-
- timeout = tn->timeouts[UDP_CT_REPLIED];
- timeout -= tn->offload_timeout;
+ const struct nf_udp_net *tn = nf_udp_pernet(net);
+ enum udp_conntrack state =
+ test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ UDP_CT_REPLIED : UDP_CT_UNREPLIED;
+
+ timeout = READ_ONCE(tn->timeouts[state]);
+ expired = nf_flow_has_expired(flow);
+ offload_timeout = READ_ONCE(tn->offload_timeout);
} else {
return;
}
+ if (expired)
+ timeout -= offload_timeout;
+
if (timeout < 0)
timeout = 0;
- if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
- WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
+ if (closing ||
+ nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ nf_ct_refresh(ct, timeout);
}
static void flow_offload_route_release(struct flow_offload *flow)
@@ -302,7 +341,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
return err;
}
- nf_ct_offload_timeout(flow->ct);
+ nf_ct_refresh(flow->ct, NF_CT_DAY);
if (nf_flowtable_hw_offload(flow_table)) {
__set_bit(NF_FLOW_HW, &flow->flags);
@@ -314,28 +353,24 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
EXPORT_SYMBOL_GPL(flow_offload_add);
void flow_offload_refresh(struct nf_flowtable *flow_table,
- struct flow_offload *flow)
+ struct flow_offload *flow, bool force)
{
u32 timeout;
timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
- if (timeout - READ_ONCE(flow->timeout) > HZ)
+ if (force || timeout - READ_ONCE(flow->timeout) > HZ)
WRITE_ONCE(flow->timeout, timeout);
else
return;
- if (likely(!nf_flowtable_hw_offload(flow_table)))
+ if (likely(!nf_flowtable_hw_offload(flow_table)) ||
+ test_bit(NF_FLOW_CLOSING, &flow->flags))
return;
nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
- return nf_flow_timeout_delta(flow->timeout) <= 0;
-}
-
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
@@ -351,8 +386,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
void flow_offload_teardown(struct flow_offload *flow)
{
clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
- set_bit(NF_FLOW_TEARDOWN, &flow->flags);
- flow_offload_fixup_ct(flow->ct);
+ if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
+ flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -416,14 +451,124 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
return err;
}
+static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
+ const struct flow_offload *flow)
+{
+ return flow_table->type->gc && flow_table->type->gc(flow);
+}
+
+/**
+ * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
+ * @ct: Flowtable offloaded tcp ct
+ *
+ * Return: new ct timeout in jiffies, or 0 if it must not be extended.
+ */
+static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
+{
+ u8 state = READ_ONCE(ct->proto.tcp.state);
+
+ switch (state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ case TCP_CONNTRACK_SYN_RECV:
+ return 0;
+ case TCP_CONNTRACK_ESTABLISHED:
+ return NF_CT_DAY;
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ return 5 * 60 * HZ;
+ case TCP_CONNTRACK_CLOSE:
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
+ * @ct: Flowtable offloaded ct
+ *
+ * Datapath lookups in the conntrack table will evict nf_conn entries
+ * if they have expired.
+ *
+ * Once nf_conn entries have been offloaded, nf_conntrack might not see any
+ * packets anymore. Thus ct->timeout is no longer refreshed and ct can
+ * be evicted.
+ *
+ * To avoid the need for an additional check on the offload bit for every
+ * packet processed via nf_conntrack_in(), set an arbitrary timeout large
+ * enough not to ever expire, this saves us a check for the IPS_OFFLOAD_BIT
+ * from the packet path via nf_ct_is_expired().
+ */
+static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
+{
+ static const u32 min_timeout = 5 * 60 * HZ;
+ u32 expires = nf_ct_expires(ct);
+
+ /* normal case: large enough timeout, nothing to do. */
+ if (likely(expires >= min_timeout))
+ return;
+
+ /* must check offload bit after this, we do not hold any locks.
+ * flowtable and ct entries could have been removed on another CPU.
+ */
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return;
+
+ /* load ct->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
+ if (nf_ct_is_confirmed(ct) &&
+ test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ u8 l4proto = nf_ct_protonum(ct);
+ u32 new_timeout = true;
+
+ switch (l4proto) {
+ case IPPROTO_UDP:
+ new_timeout = NF_CT_DAY;
+ break;
+ case IPPROTO_TCP:
+ new_timeout = nf_flow_table_tcp_timeout(ct);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* Update to ct->timeout from nf_conntrack happens
+ * without holding ct->lock.
+ *
+ * Use cmpxchg to ensure timeout extension doesn't
+ * happen when we race with conntrack datapath.
+ *
+ * The inverse -- datapath updating ->timeout right
+ * after this -- is fine, datapath is authoritative.
+ */
+ if (new_timeout) {
+ new_timeout += nfct_time_stamp;
+ cmpxchg(&ct->timeout, expires, new_timeout);
+ }
+ }
+
+ nf_ct_put(ct);
+}
+
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
struct flow_offload *flow, void *data)
{
+ bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
+
if (nf_flow_has_expired(flow) ||
- nf_ct_is_dying(flow->ct))
+ nf_ct_is_dying(flow->ct) ||
+ nf_flow_custom_gc(flow_table, flow)) {
flow_offload_teardown(flow);
+ teardown = true;
+ } else if (!teardown) {
+ nf_flow_table_extend_ct_timeout(flow->ct);
+ }
- if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ if (teardown) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
nf_flow_offload_del(flow_table, flow);
@@ -432,6 +577,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
} else {
flow_offload_del(flow_table, flow);
}
+ } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
+ test_bit(NF_FLOW_HW, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
+ nf_flow_offload_del(flow_table, flow);
} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
nf_flow_offload_stats(flow_table, flow);
}
@@ -671,8 +820,14 @@ static int __init nf_flow_table_module_init(void)
if (ret)
goto out_offload;
+ ret = nf_flow_register_bpf();
+ if (ret)
+ goto out_bpf;
+
return 0;
+out_bpf:
+ nf_flow_table_offload_exit();
out_offload:
unregister_pernet_subsys(&nf_flow_table_net_ops);
return ret;
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 0ccabf3fa6aa..b0f199171932 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -17,11 +17,15 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
switch (skb->protocol) {
case htons(ETH_P_8021Q):
+ if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
+ return NF_ACCEPT;
+
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
proto = veth->h_vlan_encapsulated_proto;
break;
case htons(ETH_P_PPP_SES):
- proto = nf_flow_pppoe_proto(skb);
+ if (!nf_flow_pppoe_proto(skb, &proto))
+ return NF_ACCEPT;
break;
default:
proto = skb->protocol;
@@ -39,7 +43,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
}
static int nf_flow_rule_route_inet(struct net *net,
- const struct flow_offload *flow,
+ struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 19efba1e51ef..78883343e5d6 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -8,6 +8,7 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
+#include <net/gso.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
@@ -27,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return 0;
tcph = (void *)(skb_network_header(skb) + thoff);
- if (unlikely(tcph->fin || tcph->rst)) {
+ if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) {
flow_offload_teardown(flow);
return -1;
}
+ if ((tcph->fin || tcph->rst) &&
+ !test_bit(NF_FLOW_CLOSING, &flow->flags))
+ set_bit(NF_FLOW_CLOSING, &flow->flags);
+
return 0;
}
@@ -140,8 +145,11 @@ static bool ip_has_options(unsigned int thoff)
static void nf_flow_tuple_encap(struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
+ __be16 inner_proto = skb->protocol;
struct vlan_ethhdr *veth;
struct pppoe_hdr *phdr;
+ struct iphdr *iph;
+ u16 offset = 0;
int i = 0;
if (skb_vlan_tag_present(skb)) {
@@ -154,47 +162,65 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
tuple->encap[i].proto = skb->protocol;
+ inner_proto = veth->h_vlan_encapsulated_proto;
+ offset += VLAN_HLEN;
break;
case htons(ETH_P_PPP_SES):
- phdr = (struct pppoe_hdr *)skb_mac_header(skb);
+ phdr = (struct pppoe_hdr *)skb_network_header(skb);
tuple->encap[i].id = ntohs(phdr->sid);
tuple->encap[i].proto = skb->protocol;
+ inner_proto = *((__be16 *)(phdr + 1));
+ offset += PPPOE_SES_HLEN;
break;
}
+
+ if (inner_proto == htons(ETH_P_IP)) {
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ if (iph->protocol == IPPROTO_IPIP) {
+ tuple->tun.dst_v4.s_addr = iph->daddr;
+ tuple->tun.src_v4.s_addr = iph->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPIP;
+ }
+ }
}
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple, u32 *hdrsize,
- u32 offset)
+struct nf_flowtable_ctx {
+ const struct net_device *in;
+ u32 offset;
+ u32 hdrsize;
+};
+
+static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
{
struct flow_ports *ports;
unsigned int thoff;
struct iphdr *iph;
u8 ipproto;
- if (!pskb_may_pull(skb, sizeof(*iph) + offset))
+ if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset))
return -1;
- iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
thoff = (iph->ihl * 4);
if (ip_is_fragment(iph) ||
unlikely(ip_has_options(thoff)))
return -1;
- thoff += offset;
+ thoff += ctx->offset;
ipproto = iph->protocol;
switch (ipproto) {
case IPPROTO_TCP:
- *hdrsize = sizeof(struct tcphdr);
+ ctx->hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
- *hdrsize = sizeof(struct udphdr);
+ ctx->hdrsize = sizeof(struct udphdr);
break;
#ifdef CONFIG_NF_CT_PROTO_GRE
case IPPROTO_GRE:
- *hdrsize = sizeof(struct gre_base_hdr);
+ ctx->hdrsize = sizeof(struct gre_base_hdr);
break;
#endif
default:
@@ -204,7 +230,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
if (iph->ttl <= 1)
return -1;
- if (!pskb_may_pull(skb, thoff + *hdrsize))
+ if (!pskb_may_pull(skb, thoff + ctx->hdrsize))
return -1;
switch (ipproto) {
@@ -224,13 +250,13 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
}
}
- iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
tuple->src_v4.s_addr = iph->saddr;
tuple->dst_v4.s_addr = iph->daddr;
tuple->l3proto = AF_INET;
tuple->l4proto = ipproto;
- tuple->iifidx = dev->ifindex;
+ tuple->iifidx = ctx->in->ifindex;
nf_flow_tuple_encap(skb, tuple);
return 0;
@@ -267,28 +293,72 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
+static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
+{
+ struct iphdr *iph;
+ u16 size;
+
+ if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+ return false;
+
+ iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+ size = iph->ihl << 2;
+
+ if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
+ return false;
+
+ if (iph->ttl <= 1)
+ return false;
+
+ if (iph->protocol == IPPROTO_IPIP)
+ *psize += size;
+
+ return true;
+}
+
+static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+{
+ struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+
+ if (iph->protocol != IPPROTO_IPIP)
+ return;
+
+ skb_pull(skb, iph->ihl << 2);
+ skb_reset_network_header(skb);
+}
+
+static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
u32 *offset)
{
+ __be16 inner_proto = skb->protocol;
struct vlan_ethhdr *veth;
+ bool ret = false;
switch (skb->protocol) {
case htons(ETH_P_8021Q):
+ if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
+ return false;
+
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
if (veth->h_vlan_encapsulated_proto == proto) {
*offset += VLAN_HLEN;
- return true;
+ inner_proto = proto;
+ ret = true;
}
break;
case htons(ETH_P_PPP_SES):
- if (nf_flow_pppoe_proto(skb) == proto) {
+ if (nf_flow_pppoe_proto(skb, &inner_proto) &&
+ inner_proto == proto) {
*offset += PPPOE_SES_HLEN;
- return true;
+ ret = true;
}
break;
}
- return false;
+ if (inner_proto == htons(ETH_P_IP))
+ ret = nf_flow_ip4_tunnel_proto(skb, offset);
+
+ return ret;
}
static void nf_flow_encap_pop(struct sk_buff *skb,
@@ -310,84 +380,86 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
skb_reset_network_header(skb);
break;
case htons(ETH_P_PPP_SES):
- skb->protocol = nf_flow_pppoe_proto(skb);
+ skb->protocol = __nf_flow_pppoe_proto(skb);
skb_pull(skb, PPPOE_SES_HLEN);
skb_reset_network_header(skb);
break;
}
}
+
+ if (skb->protocol == htons(ETH_P_IP))
+ nf_flow_ip4_tunnel_pop(skb);
}
+struct nf_flow_xmit {
+ const void *dest;
+ const void *source;
+ struct net_device *outdev;
+};
+
static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
- const struct flow_offload_tuple_rhash *tuplehash,
- unsigned short type)
+ struct nf_flow_xmit *xmit)
{
- struct net_device *outdev;
-
- outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx);
- if (!outdev)
- return NF_DROP;
-
- skb->dev = outdev;
- dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest,
- tuplehash->tuple.out.h_source, skb->len);
+ skb->dev = xmit->outdev;
+ dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
+ xmit->dest, xmit->source, skb->len);
dev_queue_xmit(skb);
return NF_STOLEN;
}
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
+static struct flow_offload_tuple_rhash *
+nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table, struct sk_buff *skb)
{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
struct flow_offload_tuple tuple = {};
+
+ if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+ return NULL;
+
+ if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
+ return NULL;
+
+ return flow_offload_lookup(flow_table, &tuple);
+}
+
+static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct flow_offload_tuple_rhash *tuplehash,
+ struct sk_buff *skb)
+{
enum flow_offload_tuple_dir dir;
struct flow_offload *flow;
- struct net_device *outdev;
- u32 hdrsize, offset = 0;
unsigned int thoff, mtu;
- struct rtable *rt;
struct iphdr *iph;
- __be32 nexthop;
- int ret;
-
- if (skb->protocol != htons(ETH_P_IP) &&
- !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (flow->tuplehash[!dir].tuple.tun_num)
+ mtu -= sizeof(*iph);
+
if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
- return NF_ACCEPT;
+ return 0;
- iph = (struct iphdr *)(skb_network_header(skb) + offset);
- thoff = (iph->ihl * 4) + offset;
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = (iph->ihl * 4) + ctx->offset;
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
- return NF_ACCEPT;
+ return 0;
if (!nf_flow_dst_check(&tuplehash->tuple)) {
flow_offload_teardown(flow);
- return NF_ACCEPT;
+ return 0;
}
- if (skb_try_make_writable(skb, thoff + hdrsize))
- return NF_DROP;
+ if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ return -1;
- flow_offload_refresh(flow_table, flow);
+ flow_offload_refresh(flow_table, flow, false);
nf_flow_encap_pop(skb, tuplehash);
- thoff -= offset;
+ thoff -= ctx->offset;
iph = ip_hdr(skb);
nf_flow_nat_ip(flow, skb, thoff, dir, iph);
@@ -398,36 +470,204 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+ return 1;
+}
+/*
+ * Push PPPOE_SES_HLEN bytes in front of @skb and fill them in as a PPPoE
+ * header followed by a PPP protocol field, using session id @id.
+ *
+ * Only ETH_P_IP and ETH_P_IPV6 payloads are handled (mapped to PPP_IP and
+ * PPP_IPV6). On success skb->protocol becomes ETH_P_PPP_SES, the network
+ * header is reset to the new header and 0 is returned. Returns -1 when
+ * skb_cow_head() fails or skb->protocol is anything else.
+ */
+static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
+{
+ int data_len = skb->len + sizeof(__be16); /* length before the push, plus the proto field */
+ struct ppp_hdr {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *ph;
+ __be16 proto;
+
+ if (skb_cow_head(skb, PPPOE_SES_HLEN))
+ return -1;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ proto = htons(PPP_IP);
+ break;
+ case htons(ETH_P_IPV6):
+ proto = htons(PPP_IPV6);
+ break;
+ default:
+ return -1;
+ }
+
+ __skb_push(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+
+ ph = (struct ppp_hdr *)(skb->data);
+ ph->hdr.ver = 1;
+ ph->hdr.type = 1;
+ ph->hdr.code = 0;
+ ph->hdr.sid = htons(id);
+ ph->hdr.length = htons(data_len);
+ ph->proto = proto;
+ skb->protocol = htons(ETH_P_PPP_SES);
+
+ return 0;
+}
+/*
+ * Encapsulate @skb in a new outer IPv4 header described by @tuple->tun.
+ *
+ * TOS, TTL and frag_off are copied from the IPv4 header currently at the
+ * network header, except that frag_off is cleared when ip_mtu_locked() is
+ * true for the cached route. The outer destination is tuple->tun.src_v4 and
+ * the outer source is tuple->tun.dst_v4 (the caller in this file passes the
+ * reverse-direction tuple, which presumably explains the swap). The outer
+ * destination is also stored in *@ip_daddr.
+ *
+ * Returns 0 on success, or the error returned by iptunnel_handle_offloads()
+ * or skb_cow_head().
+ */
+static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ __be32 *ip_daddr)
+{
+ struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+ struct rtable *rt = dst_rtable(tuple->dst_cache);
+ u8 tos = iph->tos, ttl = iph->ttl; /* saved from the inner header */
+ __be16 frag_off = iph->frag_off;
+ u32 headroom = sizeof(*iph);
+ int err;
+
+ err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4);
+ if (err)
+ return err;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; /* outer header + link layer + dst header_len */
+ err = skb_cow_head(skb, headroom);
+ if (err)
+ return err;
+
+ skb_scrub_packet(skb, true);
+ skb_clear_hash_if_not_l4(skb);
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb); /* now points at the new outer header */
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off;
+ iph->protocol = tuple->tun.l3_proto;
+ iph->tos = tos;
+ iph->daddr = tuple->tun.src_v4.s_addr;
+ iph->saddr = tuple->tun.dst_v4.s_addr;
+ iph->ttl = ttl;
+ iph->tot_len = htons(skb->len);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
+ ip_send_check(iph);
+
+ *ip_daddr = tuple->tun.src_v4.s_addr; /* report the outer destination to the caller */
+
+ return 0;
+}
+/*
+ * Add the IPv4 tunnel header to @skb when @tuple->tun_num is non-zero by
+ * delegating to nf_flow_tunnel_ipip_push(); otherwise leave @skb and
+ * *@ip_daddr untouched and return 0.
+ */
+static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ __be32 *ip_daddr)
+{
+ if (tuple->tun_num)
+ return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr);
+
+ return 0;
+}
+/*
+ * Apply the encapsulations recorded in @tuple->encap[] to @skb, in array
+ * order: 802.1Q/802.1AD entries through skb_vlan_push() and PPPoE session
+ * entries through nf_flow_pppoe_push(). Entries with any other protocol are
+ * skipped. Returns 0 on success, or -1 as soon as one push fails.
+ */
+static int nf_flow_encap_push(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ int i;
+
+ for (i = 0; i < tuple->encap_num; i++) {
+ switch (tuple->encap[i].proto) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ if (skb_vlan_push(skb, tuple->encap[i].proto,
+ tuple->encap[i].id) < 0)
+ return -1;
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0)
+ return -1;
+ break;
+ }
+ }
+
+ return 0;
+}
+/*
+ * IPv4 flowtable hook. Looks up @skb in the flowtable passed as @priv and,
+ * when an entry exists and nf_flow_offload_forward() returns a positive
+ * value, pushes tunnel/encapsulation headers from the reverse-direction
+ * tuple and transmits the packet.
+ *
+ * Returns NF_ACCEPT when no entry is found or the forward step returns 0;
+ * NF_DROP when the forward step fails, a tunnel or encap push fails, the
+ * output device cannot be found, the neighbour lookup returns an error or
+ * the xmit type is unknown; the result of nf_flow_xmit_xfrm() for XFRM
+ * flows; otherwise the result of nf_flow_queue_xmit().
+ */
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple *other_tuple;
+ enum flow_offload_tuple_dir dir;
+ struct nf_flowtable_ctx ctx = {
+ .in = state->in,
+ };
+ struct nf_flow_xmit xmit = {};
+ struct flow_offload *flow;
+ struct neighbour *neigh;
+ struct rtable *rt;
+ __be32 ip_daddr;
+ int ret;
+
+ tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb);
+ if (!tuplehash)
+ return NF_ACCEPT;
+
+ ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb);
+ if (ret < 0)
+ return NF_DROP;
+ else if (ret == 0)
+ return NF_ACCEPT;
+
if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
- rt = (struct rtable *)tuplehash->tuple.dst_cache;
+ rt = dst_rtable(tuplehash->tuple.dst_cache);
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
IPCB(skb)->flags = IPSKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ip_daddr = other_tuple->src_v4.s_addr; /* next-hop lookup address; the tunnel push may overwrite it */
+
+ if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0)
+ return NF_DROP;
+
+ if (nf_flow_encap_push(skb, other_tuple) < 0)
+ return NF_DROP;
+
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
- rt = (struct rtable *)tuplehash->tuple.dst_cache;
- outdev = rt->dst.dev;
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ rt = dst_rtable(tuplehash->tuple.dst_cache);
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr));
+ if (IS_ERR(neigh)) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = neigh->ha; /* xmit.source stays NULL on this path */
skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
- ret = NF_STOLEN;
break;
case FLOW_OFFLOAD_XMIT_DIRECT:
- ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP);
- if (ret == NF_DROP)
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+ if (!xmit.outdev) {
flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = tuplehash->tuple.out.h_dest; /* addresses cached in the tuple */
+ xmit.source = tuplehash->tuple.out.h_source;
break;
default:
WARN_ON_ONCE(1);
- ret = NF_DROP;
- break;
+ return NF_DROP;
}
- return ret;
+ return nf_flow_queue_xmit(state->net, skb, &xmit);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
@@ -535,32 +775,31 @@ static void nf_flow_nat_ipv6(const struct flow_offload *flow,
}
}
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple, u32 *hdrsize,
- u32 offset)
+static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
{
struct flow_ports *ports;
struct ipv6hdr *ip6h;
unsigned int thoff;
u8 nexthdr;
- thoff = sizeof(*ip6h) + offset;
+ thoff = sizeof(*ip6h) + ctx->offset;
if (!pskb_may_pull(skb, thoff))
return -1;
- ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
nexthdr = ip6h->nexthdr;
switch (nexthdr) {
case IPPROTO_TCP:
- *hdrsize = sizeof(struct tcphdr);
+ ctx->hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
- *hdrsize = sizeof(struct udphdr);
+ ctx->hdrsize = sizeof(struct udphdr);
break;
#ifdef CONFIG_NF_CT_PROTO_GRE
case IPPROTO_GRE:
- *hdrsize = sizeof(struct gre_base_hdr);
+ ctx->hdrsize = sizeof(struct gre_base_hdr);
break;
#endif
default:
@@ -570,7 +809,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
if (ip6h->hop_limit <= 1)
return -1;
- if (!pskb_may_pull(skb, thoff + *hdrsize))
+ if (!pskb_may_pull(skb, thoff + ctx->hdrsize))
return -1;
switch (nexthdr) {
@@ -590,67 +829,49 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
}
}
- ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
tuple->src_v6 = ip6h->saddr;
tuple->dst_v6 = ip6h->daddr;
tuple->l3proto = AF_INET6;
tuple->l4proto = nexthdr;
- tuple->iifidx = dev->ifindex;
+ tuple->iifidx = ctx->in->ifindex;
nf_flow_tuple_encap(skb, tuple);
return 0;
}
-unsigned int
-nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
+static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct flow_offload_tuple_rhash *tuplehash,
+ struct sk_buff *skb)
{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
enum flow_offload_tuple_dir dir;
- const struct in6_addr *nexthop;
struct flow_offload *flow;
- struct net_device *outdev;
unsigned int thoff, mtu;
- u32 hdrsize, offset = 0;
struct ipv6hdr *ip6h;
- struct rt6_info *rt;
- int ret;
-
- if (skb->protocol != htons(ETH_P_IPV6) &&
- !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
- return NF_ACCEPT;
+ return 0;
- ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
- thoff = sizeof(*ip6h) + offset;
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = sizeof(*ip6h) + ctx->offset;
if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
- return NF_ACCEPT;
+ return 0;
if (!nf_flow_dst_check(&tuplehash->tuple)) {
flow_offload_teardown(flow);
- return NF_ACCEPT;
+ return 0;
}
- if (skb_try_make_writable(skb, thoff + hdrsize))
- return NF_DROP;
+ if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ return -1;
- flow_offload_refresh(flow_table, flow);
+ flow_offload_refresh(flow_table, flow, false);
nf_flow_encap_pop(skb, tuplehash);
@@ -663,35 +884,100 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+ return 1;
+}
+/*
+ * Build an IPv6 lookup tuple for @skb and look it up in @flow_table.
+ *
+ * The packet is considered only if skb->protocol is ETH_P_IPV6 or
+ * nf_flow_skb_encap_protocol() accepts it (which also fills ctx->offset).
+ * Returns NULL when that check or nf_flow_tuple_ipv6() fails; otherwise the
+ * result of flow_offload_lookup() is returned unchanged.
+ */
+static struct flow_offload_tuple_rhash *
+nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct sk_buff *skb)
+{
+ struct flow_offload_tuple tuple = {};
+
+ if (skb->protocol != htons(ETH_P_IPV6) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset))
+ return NULL;
+
+ if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
+ return NULL;
+
+ return flow_offload_lookup(flow_table, &tuple);
+}
+/*
+ * IPv6 flowtable hook. Looks up @skb in the flowtable passed as @priv and,
+ * when an entry exists and nf_flow_offload_ipv6_forward() returns a
+ * positive value, pushes the encapsulation headers of the reverse-direction
+ * tuple and transmits the packet.
+ *
+ * Returns NF_ACCEPT when no entry is found or the forward step returns 0;
+ * NF_DROP when the forward step fails, the encap push fails, the output
+ * device cannot be found, the neighbour lookup returns an error or the xmit
+ * type is unknown; the result of nf_flow_xmit_xfrm() for XFRM flows;
+ * otherwise the result of nf_flow_queue_xmit().
+ */
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple *other_tuple;
+ enum flow_offload_tuple_dir dir;
+ struct nf_flowtable_ctx ctx = {
+ .in = state->in,
+ };
+ struct nf_flow_xmit xmit = {};
+ struct in6_addr *ip6_daddr;
+ struct flow_offload *flow;
+ struct neighbour *neigh;
+ struct rt6_info *rt;
+ int ret;
+
+ tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb);
+ if (ret < 0)
+ return NF_DROP;
+ else if (ret == 0)
+ return NF_ACCEPT;
+
if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
- rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+ rt = dst_rt6_info(tuplehash->tuple.dst_cache);
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
IP6CB(skb)->flags = IP6SKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ip6_daddr = &other_tuple->src_v6; /* address used for the next-hop lookup below */
+
+ if (nf_flow_encap_push(skb, other_tuple) < 0)
+ return NF_DROP;
+
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
- rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
- outdev = rt->dst.dev;
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ rt = dst_rt6_info(tuplehash->tuple.dst_cache);
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr));
+ if (IS_ERR(neigh)) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = neigh->ha; /* xmit.source stays NULL on this path */
skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
- ret = NF_STOLEN;
break;
case FLOW_OFFLOAD_XMIT_DIRECT:
- ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6);
- if (ret == NF_DROP)
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+ if (!xmit.outdev) {
flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = tuplehash->tuple.out.h_dest; /* addresses cached in the tuple */
+ xmit.source = tuplehash->tuple.out.h_source;
break;
default:
WARN_ON_ONCE(1);
- ret = NF_DROP;
- break;
+ return NF_DROP;
}
- return ret;
+ return nf_flow_queue_xmit(state->net, skb, &xmit);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 4d9b99abe37d..d8f7bfd60ac6 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -34,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
{
struct nf_flow_key *mask = &match->mask;
struct nf_flow_key *key = &match->key;
- unsigned int enc_keys;
+ unsigned long long enc_keys;
if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))
return;
@@ -43,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id);
mask->enc_key_id.keyid = 0xffffffff;
- enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
- BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL);
+ enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL);
if (ip_tunnel_info_af(tun_info) == AF_INET) {
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
@@ -55,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
mask->enc_ipv4.src = 0xffffffff;
if (key->enc_ipv4.dst)
mask->enc_ipv4.dst = 0xffffffff;
- enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
+ enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
} else {
memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst,
@@ -70,7 +70,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
sizeof(struct in6_addr)))
memset(&mask->enc_ipv6.dst, 0xff,
sizeof(struct in6_addr));
- enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
+ enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -163,14 +163,14 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
return -EOPNOTSUPP;
}
mask->control.addr_type = 0xffff;
- match->dissector.used_keys |= BIT(key->control.addr_type);
+ match->dissector.used_keys |= BIT_ULL(key->control.addr_type);
mask->basic.n_proto = 0xffff;
switch (tuple->l4proto) {
case IPPROTO_TCP:
key->tcp.flags = 0;
mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16);
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP);
break;
case IPPROTO_UDP:
case IPPROTO_GRE:
@@ -182,9 +182,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
key->basic.ip_proto = tuple->l4proto;
mask->basic.ip_proto = 0xff;
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) |
- BIT(FLOW_DISSECTOR_KEY_CONTROL) |
- BIT(FLOW_DISSECTOR_KEY_BASIC);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
switch (tuple->l4proto) {
case IPPROTO_TCP:
@@ -194,7 +194,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
key->tp.dst = tuple->dst_port;
mask->tp.dst = 0xffff;
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
break;
}
@@ -555,7 +555,7 @@ static void flow_offload_redirect(struct net *net,
switch (this_tuple->xmit_type) {
case FLOW_OFFLOAD_XMIT_DIRECT:
this_tuple = &flow->tuplehash[dir].tuple;
- ifindex = this_tuple->out.hw_ifidx;
+ ifindex = this_tuple->out.ifidx;
break;
case FLOW_OFFLOAD_XMIT_NEIGH:
other_tuple = &flow->tuplehash[!dir].tuple;
@@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
return 0;
}
-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
@@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
-int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
+int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
@@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net,
{
const struct nf_flowtable *flowtable = offload->flowtable;
const struct flow_offload_tuple *tuple, *other_tuple;
- const struct flow_offload *flow = offload->flow;
+ struct flow_offload *flow = offload->flow;
struct dst_entry *other_dst = NULL;
struct nf_flow_rule *flow_rule;
int err = -ENOMEM;
@@ -841,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
struct list_head *block_cb_list)
{
struct flow_cls_offload cls_flow = {};
+ struct netlink_ext_ack extack = {};
struct flow_block_cb *block_cb;
- struct netlink_ext_ack extack;
__be16 proto = ETH_P_ALL;
int err, i = 0;
@@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload,
ok_count += flow_offload_tuple_add(offload, flow_rule[0],
FLOW_OFFLOAD_DIR_ORIGINAL);
- ok_count += flow_offload_tuple_add(offload, flow_rule[1],
- FLOW_OFFLOAD_DIR_REPLY);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ ok_count += flow_offload_tuple_add(offload, flow_rule[1],
+ FLOW_OFFLOAD_DIR_REPLY);
if (ok_count == 0)
return -ENOENT;
@@ -926,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload)
{
clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
- flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
}
@@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
u64 lastused;
flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
- flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY,
+ &stats[1]);
lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
offload->flow->timeout = max_t(u64, offload->flow->timeout,
@@ -1188,7 +1192,7 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
int err;
if (!nf_flowtable_hw_offload(flowtable))
- return 0;
+ return nf_flow_offload_xdp_setup(flowtable, dev, cmd);
if (dev->netdev_ops->ndo_setup_tc)
err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd,
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
new file mode 100644
index 000000000000..f0984cf69a09
--- /dev/null
+++ b/net/netfilter/nf_flow_table_path.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h>
+#include <net/inet_dscp.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_flow_table.h>
+/*
+ * Choose the transmit type for @dst: FLOW_OFFLOAD_XMIT_XFRM when
+ * dst_xfrm(dst) is set, FLOW_OFFLOAD_XMIT_NEIGH otherwise.
+ */
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+ if (dst_xfrm(dst))
+ return FLOW_OFFLOAD_XMIT_XFRM;
+
+ return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+/*
+ * Fill in the default path for direction @dir from @dst_cache: the route's
+ * device becomes the ingress ifindex of the opposite direction, and
+ * @dst_cache together with its transmit type is stored for @dir.
+ */
+static void nft_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir)
+{
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
+}
+/*
+ * Return true only for a non-NULL, non-loopback ARPHRD_ETHER device whose
+ * address is ETH_ALEN bytes long and passes is_valid_ether_addr().
+ */
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+/*
+ * Resolve the hardware address of the peer for direction @dir and ask
+ * dev_fill_forward_path() for the device path stack of the route's device.
+ *
+ * The neighbour is looked up with the source address of the conntrack tuple
+ * of the opposite direction; its address is copied into @ha under the
+ * neighbour lock. If the route's device is not a valid Ethernet device the
+ * neighbour lookup is skipped entirely. @route is not referenced.
+ *
+ * Returns -1 when no neighbour entry exists or its state is not NUD_VALID;
+ * otherwise the return value of dev_fill_forward_path().
+ *
+ * NOTE(review): on the "goto out" path @ha has not been written by this
+ * function, and the caller in this file passes an uninitialised buffer -
+ * confirm that dev_fill_forward_path() and later users tolerate that.
+ */
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir, u8 *ha,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ struct neighbour *n;
+ u8 nud_state;
+
+ if (!nft_is_valid_ether_device(dev))
+ goto out;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state; /* snapshot state and address together under the lock */
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+out:
+ return dev_fill_forward_path(dev, ha, stack);
+}
+/*
+ * Summary of a device path stack, filled in by nft_dev_path_info() and
+ * consumed by nft_dev_forward_path().
+ */
+struct nft_forward_info {
+ const struct net_device *indev; /* device of the last handled path entry; NULL marks an unusable path */
+ const struct net_device *outdev; /* copy of indev taken after the walk */
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX]; /* VLAN/PPPoE encapsulations collected from the path */
+ u8 num_encaps; /* number of valid entries in encap[] */
+ struct flow_offload_tunnel tun; /* tunnel endpoints and protocol from a DEV_PATH_TUN entry */
+ u8 num_tuns; /* number of tunnel entries recorded (at most one) */
+ u8 ingress_vlans; /* bitmask over encap[], set for DEV_PATH_BR_VLAN_UNTAG_HW */
+ u8 h_source[ETH_ALEN]; /* Ethernet source address taken from a path device */
+ u8 h_dest[ETH_ALEN]; /* Ethernet destination: neighbour address or PPPoE peer */
+ enum flow_offload_xmit_type xmit_type; /* set to FLOW_OFFLOAD_XMIT_DIRECT when applicable */
+};
+/*
+ * Walk the device path @stack and summarise it in @info.
+ *
+ * @ha seeds info->h_dest; a PPPoE entry overrides it with the path's
+ * encap.h_dest. info->h_source is filled from a path device's address while
+ * it is still all-zero. VLAN and PPPoE entries are appended to
+ * info->encap[] (at most NF_FLOW_TABLE_ENCAP_MAX), and at most one tunnel
+ * entry is recorded in info->tun. Bridge entries adjust the encap list
+ * according to their vlan_mode and select FLOW_OFFLOAD_XMIT_DIRECT.
+ *
+ * info->indev is set to NULL for unsupported path types and when a limit is
+ * exceeded; info->outdev is a copy of info->indev after the walk. When
+ * @flowtable has hardware offload enabled and the resulting device is a
+ * valid Ethernet device, FLOW_OFFLOAD_XMIT_DIRECT is selected as well.
+ *
+ * NOTE(review): the "info->indev = NULL; break;" paths only leave the
+ * switch, so a later path entry can set info->indev again - confirm this is
+ * intended.
+ * NOTE(review): DEV_PATH_BR_VLAN_UNTAG_HW computes BIT(num_encaps - 1);
+ * presumably a VLAN entry always precedes it - confirm num_encaps cannot be
+ * zero here.
+ */
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+ struct nft_forward_info *info,
+ unsigned char *ha, struct nf_flowtable *flowtable)
+{
+ const struct net_device_path *path;
+ int i;
+
+ memcpy(info->h_dest, ha, ETH_ALEN);
+
+ for (i = 0; i < stack->num_paths; i++) {
+ path = &stack->path[i];
+ switch (path->type) {
+ case DEV_PATH_ETHERNET:
+ case DEV_PATH_DSA:
+ case DEV_PATH_VLAN:
+ case DEV_PATH_PPPOE:
+ case DEV_PATH_TUN:
+ info->indev = path->dev;
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ if (path->type == DEV_PATH_ETHERNET)
+ break;
+ if (path->type == DEV_PATH_DSA) {
+ i = stack->num_paths; /* ends the walk after this entry */
+ break;
+ }
+
+ /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
+ if (path->type == DEV_PATH_TUN) {
+ if (info->num_tuns) { /* only a single tunnel level is recorded */
+ info->indev = NULL;
+ break;
+ }
+ info->tun.src_v6 = path->tun.src_v6;
+ info->tun.dst_v6 = path->tun.dst_v6;
+ info->tun.l3_proto = path->tun.l3_proto;
+ info->num_tuns++;
+ } else {
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->encap[info->num_encaps].id =
+ path->encap.id;
+ info->encap[info->num_encaps].proto =
+ path->encap.proto;
+ info->num_encaps++;
+ }
+ if (path->type == DEV_PATH_PPPOE)
+ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ break;
+ case DEV_PATH_BRIDGE:
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ info->ingress_vlans |= BIT(info->num_encaps - 1); /* flags the most recently recorded encap */
+ break;
+ case DEV_PATH_BR_VLAN_TAG:
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->encap[info->num_encaps].id = path->bridge.vlan_id;
+ info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+ info->num_encaps++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ if (WARN_ON_ONCE(info->num_encaps-- == 0)) { /* drops the last recorded encap */
+ info->indev = NULL;
+ break;
+ }
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ break;
+ default:
+ info->indev = NULL;
+ break;
+ }
+ }
+ info->outdev = info->indev;
+
+ if (nf_flowtable_hw_offload(flowtable) &&
+ nft_is_valid_ether_device(info->indev))
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+/*
+ * Return true if any hook on @ft's hook_list matches @dev according to
+ * nft_hook_find_ops_rcu(). The list is walked with the _rcu iterator -
+ * presumably the caller runs under rcu_read_lock(); confirm.
+ */
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+ struct nft_flowtable *ft)
+{
+ struct nft_hook *hook;
+ bool found = false;
+
+ list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+ if (!nft_hook_find_ops_rcu(hook, dev))
+ continue;
+
+ found = true;
+ break;
+ }
+
+ return found;
+}
+/*
+ * Replace the cached route of direction @dir with one looked up for the
+ * tunnel endpoints in @tun.
+ *
+ * A flowi is built from the tunnel source/destination, the packet's input
+ * ifindex, its DSCP (IPv4) or flow label (IPv6) and the skb mark, then
+ * passed to nf_route(). For any other family the flowi is left zeroed. On
+ * success the new dst is stored in route->tuple[dir].dst and the previous
+ * one is released.
+ *
+ * Returns 0 on success, -ENOENT when nf_route() yields no dst (the existing
+ * route is left in place).
+ */
+static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt,
+ struct flow_offload_tunnel *tun,
+ struct nf_flow_route *route,
+ enum ip_conntrack_dir dir)
+{
+ struct dst_entry *cur_dst = route->tuple[dir].dst;
+ struct dst_entry *tun_dst = NULL;
+ struct flowi fl = {};
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = tun->dst_v4.s_addr;
+ fl.u.ip4.saddr = tun->src_v4.s_addr;
+ fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = tun->dst_v6;
+ fl.u.ip6.saddr = tun->src_v6;
+ fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ }
+
+ nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt));
+ if (!tun_dst)
+ return -ENOENT;
+
+ route->tuple[dir].dst = tun_dst;
+ dst_release(cur_dst); /* drop the reference held on the route being replaced */
+
+ return 0;
+}
+/*
+ * Refine the route for direction @dir using the device forward path.
+ *
+ * Resolves the path stack for the route's dst and, if the resulting ingress
+ * device belongs to flowtable @ft, records for the opposite direction the
+ * ingress ifindex, encapsulations, ingress VLAN mask and (when the tunnel
+ * route lookup succeeds) the tunnel endpoints with source and destination
+ * swapped. For @dir it records the output ifindex and, for
+ * FLOW_OFFLOAD_XMIT_DIRECT, the Ethernet source/destination addresses and
+ * the transmit type. Returns without touching @route when no usable ingress
+ * device is found.
+ */
+static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
+ struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nft_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
+ int i;
+
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nft_dev_path_info(&stack, &info, ha, &ft->data);
+
+ if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) /* also taken when the path lookup failed: info stays zeroed */
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+
+ if (info.num_tuns &&
+ !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) {
+ route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6;
+ route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6;
+ route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto;
+ route->tuple[!dir].in.num_tuns = info.num_tuns;
+ }
+
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
+}
+/*
+ * Build the flow routes for both directions of @ct from the packet in @pkt.
+ *
+ * The skb's current dst is used for @dir (a reference is taken with
+ * dst_hold_safe()) and a route for the opposite direction is obtained with
+ * nf_route() using a flowi built from the conntrack tuples, the input and
+ * output ifindexes, DSCP or flow label, and the skb mark. Each direction
+ * whose transmit type is FLOW_OFFLOAD_XMIT_NEIGH is then refined through
+ * nft_dev_forward_path().
+ *
+ * Returns 0 on success, -ENOENT if the reference on the current dst cannot
+ * be taken or the reverse route lookup yields no dst (in which case the
+ * reference taken on the current dst is dropped again).
+ */
+int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
+ struct nf_flow_route *route, enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ struct dst_entry *this_dst = skb_dst(pkt->skb);
+ struct dst_entry *other_dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
+ fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
+ fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ }
+
+ if (!dst_hold_safe(this_dst))
+ return -ENOENT;
+
+ nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+ if (!other_dst) {
+ dst_release(this_dst); /* undo the reference taken just above */
+ return -ENOENT;
+ }
+
+ nft_default_forward_path(route, this_dst, dir);
+ nft_default_forward_path(route, other_dst, !dir);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ nft_dev_forward_path(pkt, route, ct, dir, ft);
+ if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ nft_dev_forward_path(pkt, route, ct, !dir, ft);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_flow_route);
diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c
new file mode 100644
index 000000000000..e1252d042699
--- /dev/null
+++ b/net/netfilter/nf_flow_table_xdp.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct flow_offload_xdp_ft {
+ struct list_head head;
+ struct nf_flowtable *ft;
+ struct rcu_head rcuhead;
+};
+
+struct flow_offload_xdp {
+ struct hlist_node hnode;
+ unsigned long net_device_addr;
+ struct list_head head;
+};
+
+#define NF_XDP_HT_BITS 4
+static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS);
+static DEFINE_MUTEX(nf_xdp_hashtable_lock);
+
+/* caller must hold rcu read lock */
+struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev)
+{
+ unsigned long key = (unsigned long)dev;
+ struct flow_offload_xdp *iter;
+
+ hash_for_each_possible_rcu(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ struct flow_offload_xdp_ft *ft_elem;
+
+ /* The user is supposed to insert a given net_device
+ * just into a single nf_flowtable so we always return
+ * the first element here.
+ */
+ ft_elem = list_first_or_null_rcu(&iter->head,
+ struct flow_offload_xdp_ft,
+ head);
+ return ft_elem ? ft_elem->ft : NULL;
+ }
+ }
+
+ return NULL;
+}
+
+static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft,
+ const struct net_device *dev)
+{
+ struct flow_offload_xdp *iter, *elem = NULL;
+ unsigned long key = (unsigned long)dev;
+ struct flow_offload_xdp_ft *ft_elem;
+
+ ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT);
+ if (!ft_elem)
+ return -ENOMEM;
+
+ ft_elem->ft = ft;
+
+ mutex_lock(&nf_xdp_hashtable_lock);
+
+ hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ elem = iter;
+ break;
+ }
+ }
+
+ if (!elem) {
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT);
+ if (!elem)
+ goto err_unlock;
+
+ elem->net_device_addr = key;
+ INIT_LIST_HEAD(&elem->head);
+ hash_add_rcu(nf_xdp_hashtable, &elem->hnode, key);
+ }
+ list_add_tail_rcu(&ft_elem->head, &elem->head);
+
+ mutex_unlock(&nf_xdp_hashtable_lock);
+
+ return 0;
+
+err_unlock:
+ mutex_unlock(&nf_xdp_hashtable_lock);
+ kfree(ft_elem);
+
+ return -ENOMEM;
+}
+
+static void nf_flowtable_by_dev_remove(struct nf_flowtable *ft,
+ const struct net_device *dev)
+{
+ struct flow_offload_xdp *iter, *elem = NULL;
+ unsigned long key = (unsigned long)dev;
+
+ mutex_lock(&nf_xdp_hashtable_lock);
+
+ hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ elem = iter;
+ break;
+ }
+ }
+
+ if (elem) {
+ struct flow_offload_xdp_ft *ft_elem, *ft_next;
+
+ list_for_each_entry_safe(ft_elem, ft_next, &elem->head, head) {
+ if (ft_elem->ft == ft) {
+ list_del_rcu(&ft_elem->head);
+ kfree_rcu(ft_elem, rcuhead);
+ }
+ }
+
+ if (list_empty(&elem->head))
+ hash_del_rcu(&elem->hnode);
+ else
+ elem = NULL;
+ }
+
+ mutex_unlock(&nf_xdp_hashtable_lock);
+
+ if (elem) {
+ synchronize_rcu();
+ kfree(elem);
+ }
+}
+
+int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ return nf_flowtable_by_dev_insert(flowtable, dev);
+ case FLOW_BLOCK_UNBIND:
+ nf_flowtable_by_dev_remove(flowtable, dev);
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
+ return 0;
+}
diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c
index 00e89ffd78f6..2d890dd04ff8 100644
--- a/net/netfilter/nf_hooks_lwtunnel.c
+++ b/net/netfilter/nf_hooks_lwtunnel.c
@@ -3,6 +3,9 @@
#include <linux/sysctl.h>
#include <net/lwtunnel.h>
#include <net/netfilter/nf_hooks_lwtunnel.h>
+#include <linux/netfilter.h>
+
+#include "nf_internals.h"
static inline int nf_hooks_lwtunnel_get(void)
{
@@ -25,7 +28,7 @@ static inline int nf_hooks_lwtunnel_set(int enable)
}
#ifdef CONFIG_SYSCTL
-int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write,
+int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int proc_nf_hooks_lwtunnel_enabled = 0;
@@ -50,4 +53,71 @@ int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write,
return ret;
}
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler);
+
+static struct ctl_table nf_lwtunnel_sysctl_table[] = {
+ {
+ .procname = "nf_hooks_lwtunnel",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = nf_hooks_lwtunnel_sysctl_handler,
+ },
+};
+
+static int __net_init nf_lwtunnel_net_init(struct net *net)
+{
+ struct ctl_table_header *hdr;
+ struct ctl_table *table;
+
+ table = nf_lwtunnel_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_lwtunnel_sysctl_table,
+ sizeof(nf_lwtunnel_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ }
+
+ hdr = register_net_sysctl_sz(net, "net/netfilter", table,
+ ARRAY_SIZE(nf_lwtunnel_sysctl_table));
+ if (!hdr)
+ goto err_reg;
+
+ net->nf.nf_lwtnl_dir_header = hdr;
+
+ return 0;
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit nf_lwtunnel_net_exit(struct net *net)
+{
+ const struct ctl_table *table;
+
+ table = net->nf.nf_lwtnl_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations nf_lwtunnel_net_ops = {
+ .init = nf_lwtunnel_net_init,
+ .exit = nf_lwtunnel_net_exit,
+};
+
+int __init netfilter_lwtunnel_init(void)
+{
+ return register_pernet_subsys(&nf_lwtunnel_net_ops);
+}
+
+void netfilter_lwtunnel_fini(void)
+{
+ unregister_pernet_subsys(&nf_lwtunnel_net_ops);
+}
+#else
+int __init netfilter_lwtunnel_init(void) { return 0; }
+void netfilter_lwtunnel_fini(void) {}
#endif /* CONFIG_SYSCTL */
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 832ae64179f0..25403023060b 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net);
/* nf_log.c */
int __init netfilter_log_init(void);
+#ifdef CONFIG_LWTUNNEL
+/* nf_hooks_lwtunnel.c */
+int __init netfilter_lwtunnel_init(void);
+void netfilter_lwtunnel_fini(void);
+#endif
+
/* core.c */
void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
const struct nf_hook_ops *reg);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 8a29290149bd..74cef8bf554c 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -31,10 +31,10 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
int i;
for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
- if (loggers[pf][i] == NULL)
+ log = nft_log_dereference(loggers[pf][i]);
+ if (!log)
continue;
- log = nft_log_dereference(loggers[pf][i]);
if (!strncasecmp(str_logger, log->name, strlen(log->name)))
return log;
}
@@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger)
}
EXPORT_SYMBOL(nf_log_unregister);
+/**
+ * nf_log_is_registered - Check if any logger is registered for a given
+ * protocol family.
+ *
+ * @pf: Protocol family
+ *
+ * Returns: true if at least one logger is active for @pf, false otherwise.
+ */
+bool nf_log_is_registered(u_int8_t pf)
+{
+ int i;
+
+ if (pf >= NFPROTO_NUMPROTO) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
+ if (rcu_access_pointer(loggers[pf][i]))
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(nf_log_is_registered);
+
int nf_log_bind_pf(struct net *net, u_int8_t pf,
const struct nf_logger *logger)
{
@@ -156,6 +182,11 @@ int nf_logger_find_get(int pf, enum nf_log_type type)
struct nf_logger *logger;
int ret = -ENOENT;
+ if (pf >= ARRAY_SIZE(loggers))
+ return -EINVAL;
+ if (type >= NF_LOG_TYPE_MAX)
+ return -EINVAL;
+
if (pf == NFPROTO_INET) {
ret = nf_logger_find_get(NFPROTO_IPV4, type);
if (ret < 0)
@@ -193,11 +224,12 @@ void nf_logger_put(int pf, enum nf_log_type type)
return;
}
- BUG_ON(loggers[pf][type] == NULL);
-
rcu_read_lock();
logger = rcu_dereference(loggers[pf][type]);
- module_put(logger->me);
+ if (!logger)
+ WARN_ON_ONCE(1);
+ else
+ module_put(logger->me);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_logger_put);
@@ -389,7 +421,7 @@ static const struct seq_operations nflog_seq_ops = {
#ifdef CONFIG_SYSCTL
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
-static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO];
static struct ctl_table_header *nf_log_sysctl_fhdr;
static struct ctl_table nf_log_sysctl_ftable[] = {
@@ -400,10 +432,9 @@ static struct ctl_table nf_log_sysctl_ftable[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
-static int nf_log_proc_dostring(struct ctl_table *table, int write,
+static int nf_log_proc_dostring(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
const struct nf_logger *logger;
@@ -487,9 +518,10 @@ static int netfilter_log_sysctl_init(struct net *net)
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
table[i].extra2 = net;
- net->nf.nf_log_dir_header = register_net_sysctl(net,
- "net/netfilter/nf_log",
- table);
+ net->nf.nf_log_dir_header = register_net_sysctl_sz(net,
+ "net/netfilter/nf_log",
+ table,
+ ARRAY_SIZE(nf_log_sysctl_table));
if (!net->nf.nf_log_dir_header)
goto err_reg;
@@ -507,7 +539,7 @@ err_alloc:
static void netfilter_log_sysctl_exit(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->nf.nf_log_dir_header->ctl_table_arg;
unregister_net_sysctl_table(net->nf.nf_log_dir_header);
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
index cb894f0d63e9..86d5fc5d28e3 100644
--- a/net/netfilter/nf_log_syslog.c
+++ b/net/netfilter/nf_log_syslog.c
@@ -111,7 +111,8 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
unsigned int hooknum, const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct nf_loginfo *loginfo, const char *prefix)
+ const struct nf_loginfo *loginfo, const char *prefix,
+ struct net *net)
{
const struct net_device *physoutdev __maybe_unused;
const struct net_device *physindev __maybe_unused;
@@ -121,7 +122,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
in ? in->name : "",
out ? out->name : "");
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- physindev = nf_bridge_get_physindev(skb);
+ physindev = nf_bridge_get_physindev(skb, net);
if (physindev && in != physindev)
nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
physoutdev = nf_bridge_get_physoutdev(skb);
@@ -148,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
- prefix);
+ prefix, net);
dump_arp_packet(m, loginfo, skb, skb_network_offset(skb));
nf_log_buf_close(m);
@@ -215,7 +216,9 @@ nf_log_dump_tcp_header(struct nf_log_buf *m,
/* Max length: 9 "RES=0x3C " */
nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+ /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */
+ if (th->ae)
+ nf_log_buf_add(m, "AE ");
if (th->cwr)
nf_log_buf_add(m, "CWR ");
if (th->ece)
@@ -322,7 +325,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK,
ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
/* Max length: 6 "CE DF MF " */
@@ -515,7 +518,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
/* Proto Max log string length */
/* IP: 40+46+6+11+127 = 230 */
- /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */
/* UDP: 10+max(25,20) = 35 */
/* UDPLITE: 14+max(25,20) = 39 */
/* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
@@ -525,7 +528,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
/* (ICMP allows recursion one level deep) */
/* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
- /* maxlen = 230+ 91 + 230 + 252 = 803 */
+ /* maxlen = 230+ 91 + 230 + 255 = 806 */
}
static noinline_for_stack void
@@ -845,7 +848,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in,
- out, loginfo, prefix);
+ out, loginfo, prefix, net);
if (in)
dump_mac_header(m, loginfo, skb);
@@ -880,7 +883,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
- loginfo, prefix);
+ loginfo, prefix, net);
if (in)
dump_mac_header(m, loginfo, skb);
@@ -916,7 +919,7 @@ static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
- prefix);
+ prefix, net);
dump_mac_header(m, loginfo, skb);
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 0fa5a0bbb0ff..481be15609b1 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -12,9 +12,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "Global functions as their definitions will be in nf_nat BTF");
+__bpf_kfunc_start_defs();
/* bpf_ct_set_nat_info - Set source or destination nat address
*
@@ -30,9 +28,9 @@ __diag_ignore_all("-Wmissing-prototypes",
* interpreted as select a random port.
* @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
*/
-int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
- union nf_inet_addr *addr, int port,
- enum nf_nat_manip_type manip)
+__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+ union nf_inet_addr *addr, int port,
+ enum nf_nat_manip_type manip)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
u16 proto = nf_ct_l3num(ct);
@@ -54,11 +52,11 @@ int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
-__diag_pop()
+__bpf_kfunc_end_defs();
-BTF_SET8_START(nf_nat_kfunc_set)
+BTF_KFUNCS_START(nf_nat_kfunc_set)
BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
-BTF_SET8_END(nf_nat_kfunc_set)
+BTF_KFUNCS_END(nf_nat_kfunc_set)
static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index e29e4ccb5c5a..78a61dac4ade 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -27,6 +27,9 @@
#include "nf_internals.h"
+#define NF_NAT_MAX_ATTEMPTS 128
+#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)
+
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -66,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl4->fl4_dport = t->dst.u.all;
}
@@ -78,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl4->fl4_sport = t->src.u.all;
}
@@ -99,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl6->fl6_dport = t->dst.u.all;
}
@@ -111,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl6->fl6_sport = t->src.u.all;
}
@@ -180,7 +179,35 @@ hash_by_src(const struct net *net,
return reciprocal_scale(hash, nf_nat_htable_size);
}
-/* Is this tuple already taken? (not by us) */
+/**
+ * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_conntrack: our (unconfirmed) conntrack entry
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple in either direction.
+ *
+ * Example:
+ * INITIATOR -> NAT/PAT -> RESPONDER
+ *
+ * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
+ * Then, later, NAT/PAT itself also connects to RESPONDER.
+ *
+ * This will not work if the SNAT done earlier has same IP:PORT source pair.
+ *
+ * Conntrack table has:
+ * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * and new locally originating connection wants:
+ * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * ... which would mean incoming packets cannot be distinguished between
+ * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
+ *
+ * @return: true if the proposed NAT mapping collides with an existing entry.
+ */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
@@ -197,6 +224,182 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
+static bool nf_nat_allow_clash(const struct nf_conn *ct)
+{
+ return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
+}
+
+/**
+ * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_ct: our (unconfirmed) conntrack entry
+ *
+ * Same as nf_nat_used_tuple, but also check for rare clash in reverse
+ * direction. Should be called only when @tuple has not been altered, i.e.
+ * @ignored_ct will not be subject to NAT.
+ *
+ * @return: true if the proposed NAT mapping collides with existing entry.
+ */
+static noinline bool
+nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_ct)
+{
+ static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST;
+ const struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ if (!nf_nat_used_tuple(tuple, ignored_ct))
+ return false;
+
+ if (!nf_nat_allow_clash(ignored_ct))
+ return true;
+
+ /* Initial choice clashes with existing conntrack.
+ * Check for (rare) reverse collision.
+ *
+ * This can happen when new packets are received in both directions
+ * at the exact same time on different CPUs.
+ *
+ * Without SMP, first packet creates new conntrack entry and second
+ * packet is resolved as established reply packet.
+ *
+ * With parallel processing, both packets could be picked up as
+ * new and both get their own ct entry allocated.
+ *
+ * If ignored_ct and colliding ct are not subject to NAT then
+ * pretend the tuple is available and let later clash resolution
+ * handle this at insertion time.
+ *
+ * Without it, the 'reply' packet has its source port rewritten
+ * by nat engine.
+ */
+ if (READ_ONCE(ignored_ct->status) & uses_nat)
+ return true;
+
+ net = nf_ct_net(ignored_ct);
+ zone = nf_ct_zone(ignored_ct);
+
+ thash = nf_conntrack_find_get(net, zone, tuple);
+ if (unlikely(!thash)) {
+ struct nf_conntrack_tuple reply;
+
+ nf_ct_invert_tuple(&reply, tuple);
+ thash = nf_conntrack_find_get(net, zone, &reply);
+ if (!thash) /* clashing entry went away */
+ return false;
+ }
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ /* NB: IP_CT_DIR_ORIGINAL should be impossible because
+ * nf_nat_used_tuple() handles origin collisions.
+ *
+ * Handle remote chance other CPU confirmed its ct right after.
+ */
+ if (thash->tuple.dst.dir != IP_CT_DIR_REPLY)
+ goto out;
+
+ /* clashing connection subject to NAT? Retry with new tuple. */
+ if (READ_ONCE(ct->status) & uses_nat)
+ goto out;
+
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+ nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+ taken = false;
+ goto out;
+ }
+out:
+ nf_ct_put(ct);
+ return taken;
+}
+
+static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
+{
+ static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
+ IPS_DYING;
+ static const unsigned long flags_needed = IPS_SRC_NAT;
+ enum tcp_conntrack old_state;
+
+ old_state = READ_ONCE(ct->proto.tcp.state);
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ return false;
+
+ if (flags & flags_refuse)
+ return false;
+
+ return (flags & flags_needed) == flags_needed;
+}
+
+/* reverse direction will send packets to new source, so
+ * make sure such packets are invalid.
+ */
+static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
+{
+ return (__s32)(new->proto.tcp.seen[0].td_end -
+ old->proto.tcp.seen[0].td_end) > 0;
+}
+
+static int
+nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack,
+ unsigned int attempts_left)
+{
+ static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
+ struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_tuple reply;
+ unsigned long flags;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ nf_ct_invert_tuple(&reply, tuple);
+
+ if (attempts_left > NF_NAT_HARDER_THRESH ||
+ tuple->dst.protonum != IPPROTO_TCP ||
+ ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
+ return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+
+ /* Last few attempts to find a free tcp port. Destructive
+ * action: evict colliding entry if it is in timewait state and the
+ * tcp sequence number has advanced past the one used by the
+ * old entry.
+ */
+ net = nf_ct_net(ignored_conntrack);
+ zone = nf_ct_zone(ignored_conntrack);
+
+ thash = nf_conntrack_find_get(net, zone, &reply);
+ if (!thash)
+ return false;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ if (WARN_ON_ONCE(ct == ignored_conntrack))
+ goto out;
+
+ flags = READ_ONCE(ct->status);
+ if (!nf_nat_may_kill(ct, flags))
+ goto out;
+
+ if (!nf_seq_has_advanced(ct, ignored_conntrack))
+ goto out;
+
+ /* Even if we can evict do not reuse if entry is offloaded. */
+ if (nf_ct_kill(ct))
+ taken = flags & flags_offload;
+out:
+ nf_ct_put(ct);
+ return taken;
+}
+
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
const struct nf_nat_range2 *range)
{
@@ -225,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_UDPLITE:
- case IPPROTO_DCCP:
case IPPROTO_SCTP:
if (maniptype == NF_NAT_MANIP_SRC)
port = tuple->src.u.all;
@@ -242,7 +444,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
/* If we source map this tuple so reply looks like reply_tuple, will
* that meet the constraints of range.
*/
-static int in_range(const struct nf_conntrack_tuple *tuple,
+static int nf_in_range(const struct nf_conntrack_tuple *tuple,
const struct nf_nat_range2 *range)
{
/* If we are supposed to map IPs, then we must be in the
@@ -291,7 +493,7 @@ find_appropriate_src(struct net *net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
- if (in_range(result, range))
+ if (nf_in_range(result, range))
return 1;
}
}
@@ -385,7 +587,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
u16 off;
- static const unsigned int max_attempts = 128;
switch (tuple->dst.protonum) {
case IPPROTO_ICMP:
@@ -426,7 +627,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
case IPPROTO_UDPLITE:
case IPPROTO_TCP:
case IPPROTO_SCTP:
- case IPPROTO_DCCP:
if (maniptype == NF_NAT_MANIP_SRC)
keyptr = &tuple->src.u.all;
else
@@ -467,12 +667,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
find_free_id:
if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
- else
+ else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
+ maniptype != NF_NAT_MANIP_DST)
off = get_random_u16();
+ else
+ off = 0;
attempts = range_size;
- if (attempts > max_attempts)
- attempts = max_attempts;
+ if (attempts > NF_NAT_MAX_ATTEMPTS)
+ attempts = NF_NAT_MAX_ATTEMPTS;
/* We are in softirq; doing a search of the entire range risks
* soft lockup when all tuples are already used.
@@ -483,7 +686,7 @@ find_free_id:
another_round:
for (i = 0; i < attempts; i++, off++) {
*keyptr = htons(min + off % range_size);
- if (!nf_nat_used_tuple(tuple, ct))
+ if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
return;
}
@@ -523,8 +726,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
- if (in_range(orig_tuple, range)) {
- if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ if (nf_in_range(orig_tuple, range)) {
+ if (!nf_nat_used_tuple_new(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
@@ -549,8 +752,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ &range->min_proto,
+ &range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
@@ -887,10 +1090,8 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
range->flags |= NF_NAT_RANGE_MAP_IPS;
}
- if (tb[CTA_NAT_V4_MAXIP])
- range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
- else
- range->max_addr.ip = range->min_addr.ip;
+ range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP],
+ range->min_addr.ip);
return 0;
}
@@ -1017,7 +1218,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
if (!nat_proto_net->nat_hook_ops) {
WARN_ON(nat_proto_net->users != 0);
- nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
+ nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL);
if (!nat_ops) {
mutex_unlock(&nf_nat_proto_mutex);
return -ENOMEM;
@@ -1121,7 +1322,6 @@ static const struct nf_nat_hook nat_hook = {
#ifdef CONFIG_XFRM
.decode_session = __nf_nat_decode_session,
#endif
- .manip_pkt = nf_nat_manip_pkt,
.remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
@@ -1179,6 +1379,7 @@ static void __exit nf_nat_cleanup(void)
}
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Network address translation core");
module_init(nf_nat_init);
module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
index 551abd2da614..0f9a559f6207 100644
--- a/net/netfilter/nf_nat_ovs.c
+++ b/net/netfilter/nf_nat_ovs.c
@@ -75,9 +75,10 @@ static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
}
err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
if (err == NF_ACCEPT)
*action |= BIT(maniptype);
-out:
+
return err;
}
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 48cc60084d28..b14a434b9561 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -180,46 +180,6 @@ tcp_manip_pkt(struct sk_buff *skb,
}
static bool
-dccp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct dccp_hdr *hdr;
- __be16 *portptr, oldport, newport;
- int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
- if (skb->len >= hdroff + sizeof(struct dccp_hdr))
- hdrsize = sizeof(struct dccp_hdr);
-
- if (skb_ensure_writable(skb, hdroff + hdrsize))
- return false;
-
- hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- newport = tuple->src.u.dccp.port;
- portptr = &hdr->dccph_sport;
- } else {
- newport = tuple->dst.u.dccp.port;
- portptr = &hdr->dccph_dport;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- false);
-#endif
- return true;
-}
-
-static bool
icmp_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
@@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb,
case IPPROTO_ICMPV6:
return icmpv6_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
- case IPPROTO_DCCP:
- return dccp_manip_pkt(skb, iphdroff, hdroff,
- tuple, maniptype);
case IPPROTO_GRE:
return gre_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
@@ -668,7 +625,7 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int
struct flowi fl;
int err;
- err = xfrm_decode_session(skb, &fl, family);
+ err = xfrm_decode_session(net, skb, &fl, family);
if (err < 0)
return err;
@@ -697,6 +654,31 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int
}
#endif
+static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport)
+{
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ const struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ break;
+ default:
+ return false;
+ }
+
+ dir = CTINFO2DIR(ctinfo);
+ if (dir != IP_CT_DIR_ORIGINAL)
+ return false;
+
+ return ct->tuplehash[!dir].tuple.dst.u.all != sport;
+}
+
static unsigned int
nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -707,8 +689,20 @@ nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
ret = nf_nat_ipv4_fn(priv, skb, state);
- if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr &&
- !inet_sk_transparent(sk))
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* skb has a socket assigned via tcp edemux. We need to check
+ * if nf_nat_ipv4_fn() has mangled the packet in a way that
+ * edemux would not have found this socket.
+ *
+ * This includes both changes to the source address and changes
+ * to the source port, which are both handled by the
+ * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux
+ * might have found a socket for the old (pre-snat) address.
+ */
+ if (saddr != ip_hdr(skb)->saddr ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
skb_orphan(skb); /* TCP edemux obtained wrong socket */
return ret;
@@ -938,14 +932,36 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
}
static unsigned int
+nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct in6_addr saddr = ipv6_hdr(skb)->saddr;
+ struct sock *sk = skb->sk;
+ unsigned int ret;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* see nf_nat_ipv4_local_in */
+ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
+ skb_orphan(skb);
+
+ return ret;
+}
+
+static unsigned int
nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- unsigned int ret;
+ unsigned int ret, verdict;
struct in6_addr daddr = ipv6_hdr(skb)->daddr;
ret = nf_nat_ipv6_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
+ verdict = ret & NF_VERDICT_MASK;
+ if (verdict != NF_DROP && verdict != NF_STOLEN &&
ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
skb_dst_drop(skb);
@@ -1051,7 +1067,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv6_fn,
+ .hook = nf_nat_ipv6_local_in,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index f91579c821e9..5b37487d9d11 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -10,6 +10,7 @@
#include <linux/if.h>
#include <linux/inetdevice.h>
+#include <linux/in.h>
#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
@@ -24,81 +25,104 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_redirect.h>
+static unsigned int
+nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range,
+ const union nf_inet_addr *newdst)
+{
+ struct nf_nat_range2 newrange;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+ memset(&newrange, 0, sizeof(newrange));
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr = *newdst;
+ newrange.max_addr = *newdst;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
- const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- __be32 newdst;
- struct nf_nat_range2 newrange;
+ union nf_inet_addr newdst = {};
WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
hooknum != NF_INET_LOCAL_OUT);
- ct = nf_ct_get(skb, &ctinfo);
- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
-
/* Local packets: make them go to loopback */
if (hooknum == NF_INET_LOCAL_OUT) {
- newdst = htonl(0x7F000001);
+ newdst.ip = htonl(INADDR_LOOPBACK);
} else {
const struct in_device *indev;
- newdst = 0;
-
indev = __in_dev_get_rcu(skb->dev);
if (indev) {
const struct in_ifaddr *ifa;
ifa = rcu_dereference(indev->ifa_list);
if (ifa)
- newdst = ifa->ifa_local;
+ newdst.ip = ifa->ifa_local;
}
- if (!newdst)
+ if (!newdst.ip)
return NF_DROP;
}
- /* Transfer from original range. */
- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
- newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.ip = newdst;
- newrange.max_addr.ip = newdst;
- newrange.min_proto = mr->range[0].min;
- newrange.max_proto = mr->range[0].max;
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+ return nf_nat_redirect(skb, range, &newdst);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
+static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope)
+{
+ unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr);
+
+ if (ifa_addr_type & IPV6_ADDR_MAPPED)
+ return false;
+
+ if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC)))
+ return false;
+
+ if (scope) {
+ unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK;
+
+ if (!(scope & ifa_scope))
+ return false;
+ }
+
+ return true;
+}
+
unsigned int
nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_nat_range2 newrange;
- struct in6_addr newdst;
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
+ union nf_inet_addr newdst = {};
- ct = nf_ct_get(skb, &ctinfo);
if (hooknum == NF_INET_LOCAL_OUT) {
- newdst = loopback_addr;
+ newdst.in6 = loopback_addr;
} else {
+ unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr);
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
bool addr = false;
idev = __in6_dev_get(skb->dev);
if (idev != NULL) {
+ const struct inet6_ifaddr *ifa;
+
read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
- newdst = ifa->addr;
+ if (!nf_nat_redirect_ipv6_usable(ifa, scope))
+ continue;
+
+ newdst.in6 = ifa->addr;
addr = true;
break;
}
@@ -109,12 +133,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
return NF_DROP;
}
- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.in6 = newdst;
- newrange.max_addr.in6 = newdst;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
-
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+ return nf_nat_redirect(skb, range, &newdst);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 63d1516816b1..7f12e56e6e52 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -82,11 +82,9 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
const struct sk_buff *skb = entry->skb;
- struct nf_bridge_info *nf_bridge;
- nf_bridge = nf_bridge_info_get(skb);
- if (nf_bridge) {
- entry->physin = nf_bridge_get_physindev(skb);
+ if (nf_bridge_info_exists(skb)) {
+ entry->physin = nf_bridge_get_physindev(skb, entry->state.net);
entry->physout = nf_bridge_get_physoutdev(skb);
} else {
entry->physin = NULL;
@@ -250,109 +248,3 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
return 0;
}
EXPORT_SYMBOL_GPL(nf_queue);
-
-static unsigned int nf_iterate(struct sk_buff *skb,
- struct nf_hook_state *state,
- const struct nf_hook_entries *hooks,
- unsigned int *index)
-{
- const struct nf_hook_entry *hook;
- unsigned int verdict, i = *index;
-
- while (i < hooks->num_hook_entries) {
- hook = &hooks->hooks[i];
-repeat:
- verdict = nf_hook_entry_hookfn(hook, skb, state);
- if (verdict != NF_ACCEPT) {
- *index = i;
- if (verdict != NF_REPEAT)
- return verdict;
- goto repeat;
- }
- i++;
- }
-
- *index = i;
- return NF_ACCEPT;
-}
-
-static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
-{
- switch (pf) {
-#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
- case NFPROTO_BRIDGE:
- return rcu_dereference(net->nf.hooks_bridge[hooknum]);
-#endif
- case NFPROTO_IPV4:
- return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
- case NFPROTO_IPV6:
- return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
- default:
- WARN_ON_ONCE(1);
- return NULL;
- }
-
- return NULL;
-}
-
-/* Caller must hold rcu read-side lock */
-void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
-{
- const struct nf_hook_entry *hook_entry;
- const struct nf_hook_entries *hooks;
- struct sk_buff *skb = entry->skb;
- const struct net *net;
- unsigned int i;
- int err;
- u8 pf;
-
- net = entry->state.net;
- pf = entry->state.pf;
-
- hooks = nf_hook_entries_head(net, pf, entry->state.hook);
-
- i = entry->hook_index;
- if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
- kfree_skb(skb);
- nf_queue_entry_free(entry);
- return;
- }
-
- hook_entry = &hooks->hooks[i];
-
- /* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT)
- verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
-
- if (verdict == NF_ACCEPT) {
- if (nf_reroute(skb, entry) < 0)
- verdict = NF_DROP;
- }
-
- if (verdict == NF_ACCEPT) {
-next_hook:
- ++i;
- verdict = nf_iterate(skb, &entry->state, hooks, &i);
- }
-
- switch (verdict & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_STOP:
- local_bh_disable();
- entry->state.okfn(entry->state.net, entry->state.sk, skb);
- local_bh_enable();
- break;
- case NF_QUEUE:
- err = nf_queue(skb, &entry->state, i, verdict);
- if (err == 1)
- goto next_hook;
- break;
- case NF_STOLEN:
- break;
- default:
- kfree_skb(skb);
- }
-
- nf_queue_entry_free(entry);
-}
-EXPORT_SYMBOL(nf_reinject);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 16915f8eef2b..3fa3f5dfb264 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -5,7 +5,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
@@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
struct synproxy_options *opts)
{
opts->tsecr = opts->tsval;
- opts->tsval = tcp_time_stamp_raw() & ~0x3f;
+ opts->tsval = tcp_clock_ms() & ~0x3f;
if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
opts->tsval |= opts->wscale;
@@ -617,7 +617,7 @@ synproxy_recv_client_ack(struct net *net,
struct synproxy_net *snet = synproxy_pernet(net);
int mss;
- mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ mss = __cookie_v4_check(ip_hdr(skb), th);
if (mss == 0) {
this_cpu_inc(snet->stats->cookie_invalid);
return false;
@@ -800,7 +800,7 @@ synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
skb_reset_network_header(skb);
iph = skb_put(skb, sizeof(*iph));
ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = net->ipv6.devconf_all->hop_limit;
+ iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit);
iph->nexthdr = IPPROTO_TCP;
iph->saddr = *saddr;
iph->daddr = *daddr;
@@ -1034,7 +1034,7 @@ synproxy_recv_client_ack_ipv6(struct net *net,
struct synproxy_net *snet = synproxy_pernet(net);
int mss;
- mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ mss = nf_cookie_v6_check(ipv6_hdr(skb), th);
if (mss == 0) {
this_cpu_inc(snet->stats->cookie_invalid);
return false;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8c09e4d12ac1..f3de2f9bbebf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -24,14 +24,19 @@
#include <net/sock.h>
#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
+#define NFT_SET_MAX_ANONLEN 16
+
+/* limit compaction to avoid huge kmalloc/krealloc sizes. */
+#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))
unsigned int nf_tables_net_id __read_mostly;
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
-static LIST_HEAD(nf_tables_destroy_list);
+static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
+static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
enum {
NFT_VALIDATE_SKIP = 0,
@@ -100,13 +105,12 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
[NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER,
[NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID,
[NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+ [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
};
-static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
-
- switch (nft_net->validate_state) {
+ switch (table->validate_state) {
case NFT_VALIDATE_SKIP:
WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
break;
@@ -117,10 +121,12 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
return;
}
- nft_net->validate_state = new_validate_state;
+ table->validate_state = new_validate_state;
}
static void nf_tables_trans_destroy_work(struct work_struct *w);
-static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
+
+static void nft_trans_gc_work(struct work_struct *work);
+static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
@@ -141,37 +147,61 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->report = nlmsg_report(nlh);
ctx->flags = nlh->nlmsg_flags;
ctx->seq = nlh->nlmsg_seq;
+
+ bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
}
-static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
- int msg_type, u32 size, gfp_t gfp)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
{
struct nft_trans *trans;
- trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
+ trans = kzalloc(size, GFP_KERNEL);
if (trans == NULL)
return NULL;
INIT_LIST_HEAD(&trans->list);
trans->msg_type = msg_type;
- trans->ctx = *ctx;
+
+ trans->net = ctx->net;
+ trans->table = ctx->table;
+ trans->seq = ctx->seq;
+ trans->flags = ctx->flags;
+ trans->report = ctx->report;
return trans;
}
-static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
- int msg_type, u32 size)
+static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ case NFT_MSG_NEWSET:
+ return container_of(trans, struct nft_trans_binding, nft_trans);
+ }
+
+ return NULL;
+}
+
+static void nft_trans_list_del(struct nft_trans *trans)
{
- return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
+ struct nft_trans_binding *trans_binding;
+
+ list_del(&trans->list);
+
+ trans_binding = nft_trans_get_binding(trans);
+ if (trans_binding)
+ list_del(&trans_binding->binding_list);
}
static void nft_trans_destroy(struct nft_trans *trans)
{
- list_del(&trans->list);
+ nft_trans_list_del(trans);
kfree(trans);
}
-static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set,
+ bool bind)
{
struct nftables_pernet *nft_net;
struct net *net = ctx->net;
@@ -185,53 +215,154 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
switch (trans->msg_type) {
case NFT_MSG_NEWSET:
if (nft_trans_set(trans) == set)
- nft_trans_set_bound(trans) = true;
+ nft_trans_set_bound(trans) = bind;
break;
case NFT_MSG_NEWSETELEM:
if (nft_trans_elem_set(trans) == set)
- nft_trans_elem_set_bound(trans) = true;
+ nft_trans_elem_set_bound(trans) = bind;
+ break;
+ }
+ }
+}
+
+static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ return __nft_set_trans_bind(ctx, set, true);
+}
+
+static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ return __nft_set_trans_bind(ctx, set, false);
+}
+
+static void __nft_chain_trans_bind(const struct nft_ctx *ctx,
+ struct nft_chain *chain, bool bind)
+{
+ struct nftables_pernet *nft_net;
+ struct net *net = ctx->net;
+ struct nft_trans *trans;
+
+ if (!nft_chain_binding(chain))
+ return;
+
+ nft_net = nft_pernet(net);
+ list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain(trans) == chain)
+ nft_trans_chain_bound(trans) = bind;
+ break;
+ case NFT_MSG_NEWRULE:
+ if (nft_trans_rule_chain(trans) == chain)
+ nft_trans_rule_bound(trans) = bind;
break;
}
}
}
+static void nft_chain_trans_bind(const struct nft_ctx *ctx,
+ struct nft_chain *chain)
+{
+ __nft_chain_trans_bind(ctx, chain, true);
+}
+
+int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+ if (!nft_chain_binding(chain))
+ return 0;
+
+ if (nft_chain_binding(ctx->chain))
+ return -EOPNOTSUPP;
+
+ if (chain->bound)
+ return -EBUSY;
+
+ if (!nft_use_inc(&chain->use))
+ return -EMFILE;
+
+ chain->bound = true;
+ nft_chain_trans_bind(ctx, chain);
+
+ return 0;
+}
+
+void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+ __nft_chain_trans_bind(ctx, chain, false);
+}
+
static int nft_netdev_register_hooks(struct net *net,
struct list_head *hook_list)
{
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, j;
j = 0;
list_for_each_entry(hook, hook_list, list) {
- err = nf_register_net_hook(net, &hook->ops);
- if (err < 0)
- goto err_register;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nf_register_net_hook(net, ops);
+ if (err < 0)
+ goto err_register;
- j++;
+ j++;
+ }
}
return 0;
err_register:
list_for_each_entry(hook, hook_list, list) {
- if (j-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (j-- <= 0)
+ break;
- nf_unregister_net_hook(net, &hook->ops);
+ nf_unregister_net_hook(net, ops);
+ }
}
return err;
}
+static void nft_netdev_hook_free_ops(struct nft_hook *hook)
+{
+ struct nf_hook_ops *ops, *next;
+
+ list_for_each_entry_safe(ops, next, &hook->ops_list, list) {
+ list_del(&ops->list);
+ kfree(ops);
+ }
+}
+
+static void nft_netdev_hook_free(struct nft_hook *hook)
+{
+ nft_netdev_hook_free_ops(hook);
+ kfree(hook);
+}
+
+static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu)
+{
+ struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu);
+
+ nft_netdev_hook_free(hook);
+}
+
+static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
+{
+ call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
+}
+
static void nft_netdev_unregister_hooks(struct net *net,
struct list_head *hook_list,
bool release_netdev)
{
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- nf_unregister_net_hook(net, &hook->ops);
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
if (release_netdev) {
list_del(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
}
@@ -290,11 +421,130 @@ static void nf_tables_unregister_hook(struct net *net,
return __nf_tables_unregister_hook(net, table, chain, false);
}
+static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
+{
+ /* NB: the ->bound equality check is defensive, at this time we only merge
+ * a new nft_trans_elem transaction request with the transaction tail
+ * element, but a->bound != b->bound would imply a NEWRULE transaction
+ * is queued in-between.
+ *
+ * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents
+ * huge krealloc() requests.
+ */
+ return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
+}
+
+static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
+ struct nft_trans_elem *tail,
+ struct nft_trans_elem *trans)
+{
+ unsigned int nelems, old_nelems = tail->nelems;
+ struct nft_trans_elem *new_trans;
+
+ if (!nft_trans_collapse_set_elem_allowed(tail, trans))
+ return false;
+
+ /* "cannot happen", at this time userspace element add
+ * requests always allocate a new transaction element.
+ *
+ * This serves as a reminder to adjust the list_add_tail
+ * logic below in case this ever changes.
+ */
+ if (WARN_ON_ONCE(trans->nelems != 1))
+ return false;
+
+ if (check_add_overflow(old_nelems, trans->nelems, &nelems))
+ return false;
+
+ /* krealloc might free tail which invalidates list pointers */
+ list_del_init(&tail->nft_trans.list);
+
+ new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+ GFP_KERNEL);
+ if (!new_trans) {
+ list_add_tail(&tail->nft_trans.list,
+ &nft_net->commit_list);
+ return false;
+ }
+
+ /*
+ * new_trans->nft_trans.list contains garbage, but
+ * list_add_tail() doesn't care.
+ */
+ new_trans->nelems = nelems;
+ new_trans->elems[old_nelems] = trans->elems[0];
+ list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);
+
+ return true;
+}
+
+static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
+ struct nft_trans *trans)
+{
+ struct nft_trans *tail;
+
+ if (list_empty(&nft_net->commit_list))
+ return false;
+
+ tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list);
+
+ if (tail->msg_type != trans->msg_type)
+ return false;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSETELEM:
+ case NFT_MSG_DELSETELEM:
+ return nft_trans_collapse_set_elem(nft_net,
+ nft_trans_container_elem(tail),
+ nft_trans_container_elem(trans));
+ }
+
+ return false;
+}
+
static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
{
struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_trans_binding *binding;
+ struct nft_trans_set *trans_set;
list_add_tail(&trans->list, &nft_net->commit_list);
+
+ binding = nft_trans_get_binding(trans);
+ if (!binding)
+ return;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ trans_set = nft_trans_container_set(trans);
+
+ if (!nft_trans_set_update(trans) &&
+ nft_set_is_anonymous(nft_trans_set(trans)))
+ list_add_tail(&binding->binding_list, &nft_net->binding_list);
+
+ list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (!nft_trans_chain_update(trans) &&
+ nft_chain_binding(nft_trans_chain(trans)))
+ list_add_tail(&binding->binding_list, &nft_net->binding_list);
+ break;
+ }
+}
+
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
+ trans->msg_type != NFT_MSG_DELSETELEM);
+
+ if (nft_trans_try_collapse(nft_net, trans)) {
+ kfree(trans);
+ return;
+ }
+
+ nft_trans_commit_list_add_tail(net, trans);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -324,11 +574,28 @@ static int nft_deltable(struct nft_ctx *ctx)
return err;
}
-static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+static struct nft_trans *
+nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type)
{
+ struct nft_trans_chain *trans_chain;
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
+ if (!trans)
+ return NULL;
+
+ trans_chain = nft_trans_container_chain(trans);
+ INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list);
+ trans_chain->chain = ctx->chain;
+
+ return trans;
+}
+
+static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc_chain(ctx, msg_type);
if (trans == NULL)
return ERR_PTR(-ENOMEM);
@@ -340,8 +607,8 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
}
}
-
nft_trans_commit_list_add_tail(ctx->net, trans);
+
return trans;
}
@@ -353,14 +620,13 @@ static int nft_delchain(struct nft_ctx *ctx)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
nft_deactivate_next(ctx->net, ctx->chain);
return 0;
}
-static void nft_rule_expr_activate(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule)
{
struct nft_expr *expr;
@@ -373,9 +639,8 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
}
}
-static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
- struct nft_rule *rule,
- enum nft_trans_phase phase)
+void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
+ enum nft_trans_phase phase)
{
struct nft_expr *expr;
@@ -394,7 +659,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
/* You cannot delete the same rule twice */
if (nft_is_active_next(ctx->net, rule)) {
nft_deactivate_next(ctx->net, rule);
- ctx->chain->use--;
+ nft_use_dec(&ctx->chain->use);
return 0;
}
return -ENOENT;
@@ -414,6 +679,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
}
nft_trans_rule(trans) = rule;
+ nft_trans_rule_chain(trans) = ctx->chain;
nft_trans_commit_list_add_tail(ctx->net, trans);
return trans;
@@ -469,12 +735,17 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
struct nft_set *set,
const struct nft_set_desc *desc)
{
+ struct nft_trans_set *trans_set;
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
if (trans == NULL)
return -ENOMEM;
+ trans_set = nft_trans_container_set(trans);
+ INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list);
+ INIT_LIST_HEAD(&trans_set->list_trans_newset);
+
if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) {
nft_trans_set_id(trans) =
ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
@@ -485,6 +756,7 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
nft_trans_set_update(trans) = true;
nft_trans_set_gc_int(trans) = desc->gc_int;
nft_trans_set_timeout(trans) = desc->timeout;
+ nft_trans_set_size(trans) = desc->size;
}
nft_trans_commit_list_add_tail(ctx->net, trans);
@@ -497,6 +769,60 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
return __nft_trans_set_add(ctx, msg_type, set, NULL);
}
+static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ nft_set_elem_change_active(ctx->net, set, ext);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+
+ return 0;
+}
+
+struct nft_set_elem_catchall {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct nft_elem_priv *elem;
+};
+
+static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ nft_set_elem_change_active(ctx->net, set, ext);
+ nft_setelem_data_deactivate(ctx->net, set, catchall->elem);
+ break;
+ }
+}
+
+static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_mapelem_deactivate,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ WARN_ON_ONCE(iter.err);
+
+ nft_map_catchall_deactivate(ctx, set);
+}
+
static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
int err;
@@ -505,8 +831,11 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
if (err < 0)
return err;
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
nft_deactivate_next(ctx->net, set);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
return err;
}
@@ -538,20 +867,21 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
return err;
nft_deactivate_next(ctx->net, obj);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
return err;
}
-static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
- struct nft_flowtable *flowtable)
+static struct nft_trans *
+nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_flowtable *flowtable)
{
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type,
sizeof(struct nft_trans_flowtable));
if (trans == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
if (msg_type == NFT_MSG_NEWFLOWTABLE)
nft_activate_next(ctx->net, flowtable);
@@ -560,22 +890,22 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
nft_trans_flowtable(trans) = flowtable;
nft_trans_commit_list_add_tail(ctx->net, trans);
- return 0;
+ return trans;
}
static int nft_delflowtable(struct nft_ctx *ctx,
struct nft_flowtable *flowtable)
{
- int err;
+ struct nft_trans *trans;
- err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
- if (err < 0)
- return err;
+ trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
nft_deactivate_next(ctx->net, flowtable);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
- return err;
+ return 0;
}
static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
@@ -663,7 +993,7 @@ static struct nft_table *nft_table_lookup(const struct net *net,
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
const struct nlattr *nla,
- u8 genmask, u32 nlpid)
+ int family, u8 genmask, u32 nlpid)
{
struct nftables_pernet *nft_net;
struct nft_table *table;
@@ -671,6 +1001,7 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
nft_net = nft_pernet(net);
list_for_each_entry(table, &nft_net->tables, list) {
if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
+ table->family == family &&
nft_active_genmask(table, genmask)) {
if (nft_table_has_owner(table) &&
nlpid && table->nlpid != nlpid)
@@ -792,11 +1123,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
return ERR_PTR(-ENOENT);
}
-static __be16 nft_base_seq(const struct net *net)
+static unsigned int nft_base_seq(const struct net *net)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
+ return READ_ONCE(net->nft.base_seq);
+}
- return htons(nft_net->base_seq & 0xffff);
+static __be16 nft_base_seq_be16(const struct net *net)
+{
+ return htons(nft_base_seq(net) & 0xffff);
}
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
@@ -814,19 +1148,28 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
{
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
- nla_put_be32(skb, NFTA_TABLE_FLAGS,
- htonl(table->flags & NFT_TABLE_F_MASK)) ||
nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
NFTA_TABLE_PAD))
goto nla_put_failure;
+
+ if (event == NFT_MSG_DELTABLE ||
+ event == NFT_MSG_DESTROYTABLE) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_TABLE_FLAGS,
+ htonl(table->flags & NFT_TABLE_F_MASK)))
+ goto nla_put_failure;
+
if (nft_table_has_owner(table) &&
nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid)))
goto nla_put_failure;
@@ -900,7 +1243,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -1044,8 +1387,30 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table)
#define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1)
#define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0)
#define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1)
+#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2)
#define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \
- __NFT_TABLE_F_WAS_AWAKEN)
+ __NFT_TABLE_F_WAS_AWAKEN | \
+ __NFT_TABLE_F_WAS_ORPHAN)
+
+static bool nft_table_pending_update(const struct nft_ctx *ctx)
+{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
+ struct nft_trans *trans;
+
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return true;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->table == ctx->table &&
+ ((trans->msg_type == NFT_MSG_NEWCHAIN &&
+ nft_trans_chain_update(trans)) ||
+ (trans->msg_type == NFT_MSG_DELCHAIN &&
+ nft_is_base_chain(nft_trans_chain(trans)))))
+ return true;
+ }
+
+ return false;
+}
static int nf_tables_updtable(struct nft_ctx *ctx)
{
@@ -1060,15 +1425,22 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
if (flags & ~NFT_TABLE_F_MASK)
return -EOPNOTSUPP;
- if (flags == ctx->table->flags)
+ if (flags == (ctx->table->flags & NFT_TABLE_F_MASK))
return 0;
if ((nft_table_has_owner(ctx->table) &&
!(flags & NFT_TABLE_F_OWNER)) ||
- (!nft_table_has_owner(ctx->table) &&
- flags & NFT_TABLE_F_OWNER))
+ (flags & NFT_TABLE_F_OWNER &&
+ !nft_table_is_orphan(ctx->table)))
return -EOPNOTSUPP;
+ if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
+ return -EOPNOTSUPP;
+
+ /* No dormant off/on/off/on games in single transaction */
+ if (nft_table_pending_update(ctx))
+ return -EINVAL;
+
trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
sizeof(struct nft_trans_table));
if (trans == NULL)
@@ -1091,12 +1463,20 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
}
}
+ if ((flags & NFT_TABLE_F_OWNER) &&
+ !nft_table_has_owner(ctx->table)) {
+ ctx->table->nlpid = ctx->portid;
+ ctx->table->flags |= NFT_TABLE_F_OWNER |
+ __NFT_TABLE_F_WAS_ORPHAN;
+ }
+
nft_trans_table_update(trans) = true;
nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err_register_hooks:
+ ctx->table->flags |= NFT_TABLE_F_DORMANT;
nft_trans_destroy(trans);
return ret;
}
@@ -1224,6 +1604,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
if (table == NULL)
goto err_kzalloc;
+ table->validate_state = nft_net->validate_state;
table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
@@ -1281,7 +1662,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
- if (nft_chain_is_bound(chain))
+ if (nft_chain_binding(chain))
continue;
ctx->chain = chain;
@@ -1295,8 +1676,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, set))
continue;
- if (nft_set_is_anonymous(set) &&
- !list_empty(&set->bindings))
+ if (nft_set_is_anonymous(set))
continue;
err = nft_delset(ctx, set);
@@ -1326,7 +1706,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
- if (nft_chain_is_bound(chain))
+ if (nft_chain_binding(chain))
continue;
ctx->chain = chain;
@@ -1392,7 +1772,7 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
if (nla[NFTA_TABLE_HANDLE]) {
attr = nla[NFTA_TABLE_HANDLE];
- table = nft_table_lookup_byhandle(net, attr, genmask,
+ table = nft_table_lookup_byhandle(net, attr, family, genmask,
NETLINK_CB(skb).portid);
} else {
attr = nla[NFTA_TABLE_NAME];
@@ -1401,6 +1781,10 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
}
if (IS_ERR(table)) {
+ if (PTR_ERR(table) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(table);
}
@@ -1415,15 +1799,15 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
return nft_flush_table(&ctx);
}
-static void nf_tables_table_destroy(struct nft_ctx *ctx)
+static void nf_tables_table_destroy(struct nft_table *table)
{
- if (WARN_ON(ctx->table->use > 0))
+ if (WARN_ON(table->use > 0))
return;
- rhltable_destroy(&ctx->table->chains_ht);
- kfree(ctx->table->name);
- kfree(ctx->table->udata);
- kfree(ctx->table);
+ rhltable_destroy(&table->chains_ht);
+ kfree(table->name);
+ kfree(table->udata);
+ kfree(table);
}
void nft_register_chain_type(const struct nft_chain_type *ctype)
@@ -1570,8 +1954,22 @@ nla_put_failure:
return -ENOSPC;
}
-static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
- const struct nft_base_chain *basechain)
+static bool hook_is_prefix(struct nft_hook *hook)
+{
+ return strlen(hook->ifname) >= hook->ifnamelen;
+}
+
+static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook)
+{
+ int attr = hook_is_prefix(hook) ? NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME;
+
+ return nla_put_string(skb, attr, hook->ifname);
+}
+
+static int nft_dump_basechain_hook(struct sk_buff *skb,
+ const struct net *net, int family,
+ const struct nft_base_chain *basechain,
+ const struct list_head *hook_list)
{
const struct nf_hook_ops *ops = &basechain->ops;
struct nft_hook *hook, *first = NULL;
@@ -1588,19 +1986,26 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
if (nft_base_chain_netdev(family, ops->hooknum)) {
nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
- list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (!nest_devs)
+ goto nla_put_failure;
+
+ if (!hook_list)
+ hook_list = &basechain->hook_list;
+
+ list_for_each_entry_rcu(hook, hook_list, list,
+ lockdep_commit_lock_is_held(net)) {
if (!first)
first = hook;
- if (nla_put_string(skb, NFTA_DEVICE_NAME,
- hook->ops.dev->name))
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
n++;
}
nla_nest_end(skb, nest_devs);
if (n == 1 &&
- nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
+ !hook_is_prefix(first) &&
+ nla_put_string(skb, NFTA_HOOK_DEV, first->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -1613,29 +2018,35 @@ nla_put_failure:
static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq, int event, u32 flags,
int family, const struct nft_table *table,
- const struct nft_chain *chain)
+ const struct nft_chain *chain,
+ const struct list_head *hook_list)
{
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
+ if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) ||
+ nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) ||
+ nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
NFTA_CHAIN_PAD))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
- goto nla_put_failure;
+
+ if (!hook_list &&
+ (event == NFT_MSG_DELCHAIN ||
+ event == NFT_MSG_DESTROYCHAIN)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
if (nft_is_base_chain(chain)) {
const struct nft_base_chain *basechain = nft_base_chain(chain);
struct nft_stats __percpu *stats;
- if (nft_dump_basechain_hook(skb, family, basechain))
+ if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -1670,7 +2081,8 @@ nla_put_failure:
return -1;
}
-static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
+ const struct list_head *hook_list)
{
struct nftables_pernet *nft_net;
struct sk_buff *skb;
@@ -1690,7 +2102,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
event, flags, ctx->family, ctx->table,
- ctx->chain);
+ ctx->chain, hook_list);
if (err < 0) {
kfree_skb(skb);
goto err;
@@ -1716,7 +2128,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -1736,7 +2148,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
NFT_MSG_NEWCHAIN,
NLM_F_MULTI,
table->family, table,
- chain) < 0)
+ chain, NULL) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1790,7 +2202,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
- 0, family, table, chain);
+ 0, family, table, chain, NULL);
if (err < 0)
goto err_fill_chain_info;
@@ -1816,14 +2228,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr,
nft_counter_policy, NULL);
if (err < 0)
- return ERR_PTR(err);
+ return ERR_PTR_PCPU(err);
if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
- return ERR_PTR(-EINVAL);
+ return ERR_PTR_PCPU(-EINVAL);
newstats = netdev_alloc_pcpu_stats(struct nft_stats);
if (newstats == NULL)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR_PCPU(-ENOMEM);
/* Restore old counters on this cpu, no problem. Per-cpu statistics
* are not exposed to userspace.
@@ -1837,18 +2249,19 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct nft_trans *trans)
+static void nft_chain_stats_replace(struct nft_trans_chain *trans)
{
- struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain);
+ const struct nft_trans *t = &trans->nft_trans_binding.nft_trans;
+ struct nft_base_chain *chain = nft_base_chain(trans->chain);
- if (!nft_trans_chain_stats(trans))
+ if (!trans->stats)
return;
- nft_trans_chain_stats(trans) =
- rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans),
- lockdep_commit_lock_is_held(trans->ctx.net));
+ trans->stats =
+ rcu_replace_pointer(chain->stats, trans->stats,
+ lockdep_commit_lock_is_held(t->net));
- if (!nft_trans_chain_stats(trans))
+ if (!trans->stats)
static_branch_inc(&nft_counters_enabled);
}
@@ -1866,9 +2279,9 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
kvfree(chain->blob_next);
}
-void nf_tables_chain_destroy(struct nft_ctx *ctx)
+void nf_tables_chain_destroy(struct nft_chain *chain)
{
- struct nft_chain *chain = ctx->chain;
+ const struct nft_table *table = chain->table;
struct nft_hook *hook, *next;
if (WARN_ON(chain->use > 0))
@@ -1880,11 +2293,11 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx)
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
- if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
module_put(basechain->type->owner);
@@ -1903,36 +2316,47 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx)
}
static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
- const struct nlattr *attr)
+ const struct nlattr *attr,
+ bool prefix)
{
+ struct nf_hook_ops *ops;
struct net_device *dev;
- char ifname[IFNAMSIZ];
struct nft_hook *hook;
int err;
- hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
- if (!hook) {
- err = -ENOMEM;
- goto err_hook_alloc;
- }
+ hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
+ if (!hook)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&hook->ops_list);
+
+ err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
+ if (err < 0)
+ goto err_hook_free;
+
+ /* include the terminating NUL-char when comparing non-prefixes */
+ hook->ifnamelen = strlen(hook->ifname) + !prefix;
- nla_strscpy(ifname, attr, IFNAMSIZ);
/* nf_tables_netdev_event() is called under rtnl_mutex, this is
* indirectly serializing all the other holders of the commit_mutex with
* the rtnl_mutex.
*/
- dev = __dev_get_by_name(net, ifname);
- if (!dev) {
- err = -ENOENT;
- goto err_hook_dev;
- }
- hook->ops.dev = dev;
+ for_each_netdev(net, dev) {
+ if (strncmp(dev->name, hook->ifname, hook->ifnamelen))
+ continue;
+ ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT);
+ if (!ops) {
+ err = -ENOMEM;
+ goto err_hook_free;
+ }
+ ops->dev = dev;
+ list_add_tail(&ops->list, &hook->ops_list);
+ }
return hook;
-err_hook_dev:
- kfree(hook);
-err_hook_alloc:
+err_hook_free:
+ nft_netdev_hook_free(hook);
return ERR_PTR(err);
}
@@ -1942,7 +2366,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
- if (this->ops.dev == hook->ops.dev)
+ if (!strncmp(hook->ifname, this->ifname,
+ min(hook->ifnamelen, this->ifnamelen)))
return hook;
}
@@ -1951,25 +2376,36 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
static int nf_tables_parse_netdev_hooks(struct net *net,
const struct nlattr *attr,
- struct list_head *hook_list)
+ struct list_head *hook_list,
+ struct netlink_ext_ack *extack)
{
struct nft_hook *hook, *next;
const struct nlattr *tmp;
int rem, n = 0, err;
+ bool prefix;
nla_for_each_nested(tmp, attr, rem) {
- if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+ switch (nla_type(tmp)) {
+ case NFTA_DEVICE_NAME:
+ prefix = false;
+ break;
+ case NFTA_DEVICE_PREFIX:
+ prefix = true;
+ break;
+ default:
err = -EINVAL;
goto err_hook;
}
- hook = nft_netdev_hook_alloc(net, tmp);
+ hook = nft_netdev_hook_alloc(net, tmp, prefix);
if (IS_ERR(hook)) {
+ NL_SET_BAD_ATTR(extack, tmp);
err = PTR_ERR(hook);
goto err_hook;
}
if (nft_hook_list_find(hook_list, hook)) {
- kfree(hook);
+ NL_SET_BAD_ATTR(extack, tmp);
+ nft_netdev_hook_free(hook);
err = -EEXIST;
goto err_hook;
}
@@ -1987,7 +2423,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
err_hook:
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free(hook);
}
return err;
}
@@ -1999,38 +2435,41 @@ struct nft_chain_hook {
struct list_head list;
};
-static int nft_chain_parse_netdev(struct net *net,
- struct nlattr *tb[],
- struct list_head *hook_list)
+static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[],
+ struct list_head *hook_list,
+ struct netlink_ext_ack *extack, u32 flags)
{
struct nft_hook *hook;
int err;
if (tb[NFTA_HOOK_DEV]) {
- hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]);
- if (IS_ERR(hook))
+ hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false);
+ if (IS_ERR(hook)) {
+ NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]);
return PTR_ERR(hook);
+ }
list_add_tail(&hook->list, hook_list);
} else if (tb[NFTA_HOOK_DEVS]) {
err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS],
- hook_list);
+ hook_list, extack);
if (err < 0)
return err;
- if (list_empty(hook_list))
- return -EINVAL;
- } else {
- return -EINVAL;
}
+ if (flags & NFT_CHAIN_HW_OFFLOAD &&
+ list_empty(hook_list))
+ return -EINVAL;
+
return 0;
}
static int nft_chain_parse_hook(struct net *net,
+ struct nft_base_chain *basechain,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
- struct netlink_ext_ack *extack, bool autoload)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct nftables_pernet *nft_net = nft_pernet(net);
struct nlattr *ha[NFTA_HOOK_MAX + 1];
@@ -2046,31 +2485,57 @@ static int nft_chain_parse_hook(struct net *net,
if (err < 0)
return err;
- if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
- ha[NFTA_HOOK_PRIORITY] == NULL)
- return -EINVAL;
+ if (!basechain) {
+ if (!ha[NFTA_HOOK_HOOKNUM] ||
+ !ha[NFTA_HOOK_PRIORITY]) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
+ return -ENOENT;
+ }
- hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
- hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
- type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
- if (!type)
- return -EOPNOTSUPP;
+ type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
+ if (!type)
+ return -EOPNOTSUPP;
- if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
- family, autoload);
- if (IS_ERR(type)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
- return PTR_ERR(type);
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+ family, true);
+ if (IS_ERR(type)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return PTR_ERR(type);
+ }
}
- }
- if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
- return -EOPNOTSUPP;
+ if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+ return -EOPNOTSUPP;
- if (type->type == NFT_CHAIN_T_NAT &&
- hook->priority <= NF_IP_PRI_CONNTRACK)
- return -EOPNOTSUPP;
+ if (type->type == NFT_CHAIN_T_NAT &&
+ hook->priority <= NF_IP_PRI_CONNTRACK)
+ return -EOPNOTSUPP;
+ } else {
+ if (ha[NFTA_HOOK_HOOKNUM]) {
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ if (hook->num != basechain->ops.hooknum)
+ return -EOPNOTSUPP;
+ }
+ if (ha[NFTA_HOOK_PRIORITY]) {
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+ if (hook->priority != basechain->ops.priority)
+ return -EOPNOTSUPP;
+ }
+
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
+ family);
+ if (!type) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return -ENOENT;
+ }
+ } else {
+ type = basechain->type;
+ }
+ }
if (!try_module_get(type->owner)) {
if (nla[NFTA_CHAIN_TYPE])
@@ -2082,7 +2547,7 @@ static int nft_chain_parse_hook(struct net *net,
INIT_LIST_HEAD(&hook->list);
if (nft_base_chain_netdev(family, hook->num)) {
- err = nft_chain_parse_netdev(net, ha, &hook->list);
+ err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags);
if (err < 0) {
module_put(type->owner);
return err;
@@ -2101,43 +2566,39 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
list_for_each_entry_safe(h, next, &hook->list, list) {
list_del(&h->list);
- kfree(h);
+ nft_netdev_hook_free(h);
}
module_put(hook->type->owner);
}
-struct nft_rules_old {
- struct rcu_head h;
- struct nft_rule_blob *blob;
-};
-
-static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr)
+static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
{
- struct nft_rule_dp *prule;
+ struct nft_rule_dp_last *lrule;
- prule = (struct nft_rule_dp *)ptr;
- prule->is_last = 1;
+ BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0);
+
+ lrule = (struct nft_rule_dp_last *)ptr;
+ lrule->end.is_last = 1;
+ lrule->chain = chain;
/* blob size does not include the trailer rule */
}
-static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size)
+static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+ unsigned int size)
{
struct nft_rule_blob *blob;
- /* size must include room for the last rule */
- if (size < offsetof(struct nft_rule_dp, data))
- return NULL;
-
- size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rules_old);
if (size > INT_MAX)
return NULL;
+ size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last);
+
blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
if (!blob)
return NULL;
blob->size = 0;
- nft_last_rule(blob, blob->data);
+ nft_last_rule(chain, blob->data);
return blob;
}
@@ -2158,6 +2619,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
struct nft_chain_hook *hook, u32 flags)
{
struct nft_chain *chain;
+ struct nf_hook_ops *ops;
struct nft_hook *h;
basechain->type = hook->type;
@@ -2166,14 +2628,12 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
if (nft_base_chain_netdev(family, hook->num)) {
list_splice_init(&hook->list, &basechain->hook_list);
- list_for_each_entry(h, &basechain->hook_list, list)
- nft_basechain_hook_init(&h->ops, family, hook, chain);
-
- basechain->ops.hooknum = hook->num;
- basechain->ops.priority = hook->priority;
- } else {
- nft_basechain_hook_init(&basechain->ops, family, hook, chain);
+ list_for_each_entry(h, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nft_basechain_hook_init(ops, family, hook, chain);
+ }
}
+ nft_basechain_hook_init(&basechain->ops, family, hook, chain);
chain->flags |= NFT_CHAIN_BASE | flags;
basechain->policy = NF_ACCEPT;
@@ -2188,7 +2648,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
return 0;
}
-static int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
+int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
{
int err;
@@ -2204,9 +2664,8 @@ static int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
static u64 chain_id;
-static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy, u32 flags,
- struct netlink_ext_ack *extack)
+static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy,
+ u32 flags, struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -2216,21 +2675,20 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
struct nft_rule_blob *blob;
struct nft_trans *trans;
struct nft_chain *chain;
- unsigned int data_size;
int err;
- if (table->use == UINT_MAX)
- return -EOVERFLOW;
-
if (nla[NFTA_CHAIN_HOOK]) {
struct nft_stats __percpu *stats = NULL;
- struct nft_chain_hook hook;
+ struct nft_chain_hook hook = {};
+
+ if (table->flags & __NFT_TABLE_F_UPDATE)
+ return -EINVAL;
if (flags & NFT_CHAIN_BINDING)
return -EOPNOTSUPP;
- err = nft_chain_parse_hook(net, nla, &hook, family, extack,
- true);
+ err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags,
+ extack);
if (err < 0)
return err;
@@ -2243,10 +2701,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
- if (IS_ERR(stats)) {
+ if (IS_ERR_PCPU(stats)) {
nft_chain_release_hook(&hook);
kfree(basechain);
- return PTR_ERR(stats);
+ return PTR_ERR_PCPU(stats);
}
rcu_assign_pointer(basechain->stats, stats);
}
@@ -2304,8 +2762,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
- data_size = offsetof(struct nft_rule_dp, data); /* last rule */
- blob = nf_tables_chain_alloc_rules(data_size);
+ blob = nf_tables_chain_alloc_rules(chain, 0);
if (!blob) {
err = -ENOMEM;
goto err_destroy_chain;
@@ -2314,14 +2771,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
RCU_INIT_POINTER(chain->blob_gen_0, blob);
RCU_INIT_POINTER(chain->blob_gen_1, blob);
- err = nf_tables_register_hook(net, table, chain);
- if (err < 0)
+ if (!nft_use_inc(&table->use)) {
+ err = -EMFILE;
goto err_destroy_chain;
+ }
trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto err_unregister_hook;
+ goto err_trans;
}
nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
@@ -2329,81 +2787,96 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
nft_trans_chain_policy(trans) = policy;
err = nft_chain_add(table, chain);
- if (err < 0) {
- nft_trans_destroy(trans);
- goto err_unregister_hook;
- }
+ if (err < 0)
+ goto err_chain_add;
- table->use++;
+ /* This must be LAST to ensure no packets are walking over this chain. */
+ err = nf_tables_register_hook(net, table, chain);
+ if (err < 0)
+ goto err_register_hook;
return 0;
-err_unregister_hook:
- nf_tables_unregister_hook(net, table, chain);
+
+err_register_hook:
+ nft_chain_del(chain);
+err_chain_add:
+ nft_trans_destroy(trans);
+err_trans:
+ nft_use_dec_restore(&table->use);
err_destroy_chain:
- nf_tables_chain_destroy(ctx);
+ nf_tables_chain_destroy(chain);
return err;
}
-static bool nft_hook_list_equal(struct list_head *hook_list1,
- struct list_head *hook_list2)
-{
- struct nft_hook *hook;
- int n = 0, m = 0;
-
- n = 0;
- list_for_each_entry(hook, hook_list2, list) {
- if (!nft_hook_list_find(hook_list1, hook))
- return false;
-
- n++;
- }
- list_for_each_entry(hook, hook_list1, list)
- m++;
-
- return n == m;
-}
-
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
u32 flags, const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
+ struct nft_base_chain *basechain = NULL;
struct nft_table *table = ctx->table;
struct nft_chain *chain = ctx->chain;
- struct nft_base_chain *basechain;
- struct nft_stats *stats = NULL;
- struct nft_chain_hook hook;
+ struct nft_chain_hook hook = {};
+ struct nft_stats __percpu *stats = NULL;
+ struct nftables_pernet *nft_net;
+ struct nft_hook *h, *next;
struct nf_hook_ops *ops;
struct nft_trans *trans;
+ bool unregister = false;
int err;
if (chain->flags ^ flags)
return -EOPNOTSUPP;
+ INIT_LIST_HEAD(&hook.list);
+
if (nla[NFTA_CHAIN_HOOK]) {
if (!nft_is_base_chain(chain)) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
- extack, false);
+
+ basechain = nft_base_chain(chain);
+ err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
+ ctx->family, flags, extack);
if (err < 0)
return err;
- basechain = nft_base_chain(chain);
if (basechain->type != hook.type) {
nft_chain_release_hook(&hook);
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nft_base_chain_netdev(ctx->family, hook.num)) {
- if (!nft_hook_list_equal(&basechain->hook_list,
- &hook.list)) {
- nft_chain_release_hook(&hook);
- NL_SET_BAD_ATTR(extack, attr);
- return -EEXIST;
+ if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
+ list_for_each_entry_safe(h, next, &hook.list, list) {
+ list_for_each_entry(ops, &h->ops_list, list) {
+ ops->pf = basechain->ops.pf;
+ ops->hooknum = basechain->ops.hooknum;
+ ops->priority = basechain->ops.priority;
+ ops->priv = basechain->ops.priv;
+ ops->hook = basechain->ops.hook;
+ }
+
+ if (nft_hook_list_find(&basechain->hook_list, h)) {
+ list_del(&h->list);
+ nft_netdev_hook_free(h);
+ continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWCHAIN ||
+ trans->table != ctx->table ||
+ !nft_trans_chain_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) {
+ nft_chain_release_hook(&hook);
+ return -EEXIST;
+ }
+ }
}
} else {
ops = &basechain->ops;
@@ -2414,7 +2887,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return -EEXIST;
}
}
- nft_chain_release_hook(&hook);
}
if (nla[NFTA_CHAIN_HANDLE] &&
@@ -2425,24 +2897,50 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nla[NFTA_CHAIN_NAME], genmask);
if (!IS_ERR(chain2)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
- return -EEXIST;
+ err = -EEXIST;
+ goto err_hooks;
+ }
+ }
+
+ if (table->flags & __NFT_TABLE_F_UPDATE &&
+ !list_empty(&hook.list)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ err = -EOPNOTSUPP;
+ goto err_hooks;
+ }
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nft_is_base_chain(chain) &&
+ !list_empty(&hook.list)) {
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
+
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ err = nft_netdev_register_hooks(ctx->net, &hook.list);
+ if (err < 0)
+ goto err_hooks;
+
+ unregister = true;
}
}
if (nla[NFTA_CHAIN_COUNTERS]) {
- if (!nft_is_base_chain(chain))
- return -EOPNOTSUPP;
+ if (!nft_is_base_chain(chain)) {
+ err = -EOPNOTSUPP;
+ goto err_hooks;
+ }
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
- if (IS_ERR(stats))
- return PTR_ERR(stats);
+ if (IS_ERR_PCPU(stats)) {
+ err = PTR_ERR_PCPU(stats);
+ goto err_hooks;
+ }
}
err = -ENOMEM;
- trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
- sizeof(struct nft_trans_chain));
+ trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN);
if (trans == NULL)
- goto err;
+ goto err_trans;
nft_trans_chain_stats(trans) = stats;
nft_trans_chain_update(trans) = true;
@@ -2461,47 +2959,67 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
err = -ENOMEM;
name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
if (!name)
- goto err;
+ goto err_trans;
err = -EEXIST;
list_for_each_entry(tmp, &nft_net->commit_list, list) {
if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
- tmp->ctx.table == table &&
+ tmp->table == table &&
nft_trans_chain_update(tmp) &&
nft_trans_chain_name(tmp) &&
strcmp(name, nft_trans_chain_name(tmp)) == 0) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
kfree(name);
- goto err;
+ goto err_trans;
}
}
nft_trans_chain_name(trans) = name;
}
+
+ nft_trans_basechain(trans) = basechain;
+ INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+ list_splice(&hook.list, &nft_trans_chain_hooks(trans));
+ if (nla[NFTA_CHAIN_HOOK])
+ module_put(hook.type->owner);
+
nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
-err:
+
+err_trans:
free_percpu(stats);
kfree(trans);
+err_hooks:
+ if (nla[NFTA_CHAIN_HOOK]) {
+ list_for_each_entry_safe(h, next, &hook.list, list) {
+ if (unregister) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nf_unregister_net_hook(ctx->net, ops);
+ }
+ list_del(&h->list);
+ nft_netdev_hook_free_rcu(h);
+ }
+ module_put(hook.type->owner);
+ }
+
return err;
}
static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
const struct nft_table *table,
- const struct nlattr *nla)
+ const struct nlattr *nla, u8 genmask)
{
struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
struct nft_trans *trans;
list_for_each_entry(trans, &nft_net->commit_list, list) {
- struct nft_chain *chain = trans->ctx.chain;
-
if (trans->msg_type == NFT_MSG_NEWCHAIN &&
- chain->table == table &&
- id == nft_trans_chain_id(trans))
- return chain;
+ nft_trans_chain(trans)->table == table &&
+ id == nft_trans_chain_id(trans) &&
+ nft_active_genmask(nft_trans_chain(trans), genmask))
+ return nft_trans_chain(trans);
}
return ERR_PTR(-ENOENT);
}
@@ -2604,7 +3122,59 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
extack);
}
- return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack);
+ return nf_tables_addchain(&ctx, family, policy, flags, extack);
+}
+
+static int nft_delchain_hook(struct nft_ctx *ctx,
+ struct nft_base_chain *basechain,
+ struct netlink_ext_ack *extack)
+{
+ const struct nft_chain *chain = &basechain->chain;
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_chain_hook chain_hook = {};
+ struct nft_hook *this, *hook;
+ LIST_HEAD(chain_del_list);
+ struct nft_trans *trans;
+ int err;
+
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return -EOPNOTSUPP;
+
+ err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook,
+ ctx->family, chain->flags, extack);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry(this, &chain_hook.list, list) {
+ hook = nft_hook_list_find(&basechain->hook_list, this);
+ if (!hook) {
+ err = -ENOENT;
+ goto err_chain_del_hook;
+ }
+ list_move(&hook->list, &chain_del_list);
+ }
+
+ trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN);
+ if (!trans) {
+ err = -ENOMEM;
+ goto err_chain_del_hook;
+ }
+
+ nft_trans_basechain(trans) = basechain;
+ nft_trans_chain_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+ list_splice(&chain_del_list, &nft_trans_chain_hooks(trans));
+ nft_chain_release_hook(&chain_hook);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_chain_del_hook:
+ list_splice(&chain_del_list, &basechain->hook_list);
+ nft_chain_release_hook(&chain_hook);
+
+ return err;
}
static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
@@ -2639,16 +3209,36 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
chain = nft_chain_lookup(net, table, attr, genmask);
}
if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
}
+ if (nft_chain_binding(chain))
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
+
+ if (nla[NFTA_CHAIN_HOOK]) {
+ if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN ||
+ chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ return -EOPNOTSUPP;
+
+ if (nft_is_base_chain(chain)) {
+ struct nft_base_chain *basechain = nft_base_chain(chain);
+
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ return nft_delchain_hook(&ctx, basechain, extack);
+ }
+ }
+
if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
-
use = chain->use;
list_for_each_entry(rule, &chain->rules, list) {
if (!nft_is_active_next(net, rule))
@@ -2684,6 +3274,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
*/
int nft_register_expr(struct nft_expr_type *type)
{
+ if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
+ return -ENOMEM;
+
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
list_add_tail_rcu(&type->list, &nf_tables_expressions);
@@ -2713,7 +3306,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
{
const struct nft_expr_type *type, *candidate = NULL;
- list_for_each_entry(type, &nf_tables_expressions, list) {
+ list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
if (!nla_strcmp(nla, type->name)) {
if (!type->family && !candidate)
candidate = type;
@@ -2745,9 +3338,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
+ rcu_read_lock();
type = __nft_expr_type_get(family, nla);
- if (type != NULL && try_module_get(type->owner))
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -2881,28 +3478,40 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
if (err < 0)
return err;
- if (!tb[NFTA_EXPR_DATA])
+ if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME])
return -EINVAL;
+ rcu_read_lock();
+
type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
- if (!type)
- return -ENOENT;
+ if (!type) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
- if (!type->inner_ops)
- return -EOPNOTSUPP;
+ if (!type->inner_ops) {
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
err = nla_parse_nested_deprecated(info->tb, type->maxattr,
tb[NFTA_EXPR_DATA],
type->policy, NULL);
if (err < 0)
- goto err_nla_parse;
+ goto out_unlock;
info->attr = nla;
info->ops = type->inner_ops;
+ /* No module reference will be taken on type->owner.
+ * Presence of type->inner_ops implies that the expression
+ * is builtin, so it cannot go away.
+ */
+ rcu_read_unlock();
return 0;
-err_nla_parse:
+out_unlock:
+ rcu_read_unlock();
return err;
}
@@ -2974,18 +3583,17 @@ err_expr_parse:
return ERR_PTR(err);
}
-int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
{
int err;
- if (src->ops->clone) {
- dst->ops = src->ops;
- err = src->ops->clone(dst, src);
- if (err < 0)
- return err;
- } else {
- memcpy(dst, src, src->ops->size);
- }
+ if (WARN_ON_ONCE(!src->ops->clone))
+ return -EINVAL;
+
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src, gfp);
+ if (err < 0)
+ return err;
__module_get(src->ops->type->owner);
@@ -3002,13 +3610,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
* Rules
*/
-static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
+static struct nft_rule *__nft_rule_lookup(const struct net *net,
+ const struct nft_chain *chain,
u64 handle)
{
struct nft_rule *rule;
// FIXME: this sucks
- list_for_each_entry_rcu(rule, &chain->rules, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list,
+ lockdep_commit_lock_is_held(net)) {
if (handle == rule->handle)
return rule;
}
@@ -3016,13 +3626,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
return ERR_PTR(-ENOENT);
}
-static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
+static struct nft_rule *nft_rule_lookup(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
if (nla == NULL)
return ERR_PTR(-EINVAL);
- return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+ return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla)));
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -3031,7 +3642,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
[NFTA_RULE_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE] = { .type = NLA_U64 },
- [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
[NFTA_RULE_COMPAT] = { .type = NLA_NESTED },
[NFTA_RULE_POSITION] = { .type = NLA_U64 },
[NFTA_RULE_USERDATA] = { .type = NLA_BINARY,
@@ -3055,7 +3666,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
- nft_base_seq(net));
+ nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -3143,33 +3754,44 @@ err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
+/* Emit an AUDIT_NFT_OP_RULE_RESET audit record for @table.
+ *
+ * @table: table whose name and family are put into the record
+ * @base_seq: sequence number appended to the table name as "<name>:<seq>"
+ * @nentries: entry count handed to audit_log_nfcfg()
+ *
+ * Both the string allocation and the audit call use GFP_ATOMIC.
+ * NOTE(review): the kasprintf() result is passed on without a NULL check;
+ * confirm that audit_log_nfcfg() copes with a NULL name.
+ */
+static void audit_log_rule_reset(const struct nft_table *table,
+ unsigned int base_seq,
+ unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
+ table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+ kfree(buf);
+}
+
struct nft_rule_dump_ctx {
+ unsigned int s_idx;
char *table;
char *chain;
+ bool reset;
};
static int __nf_tables_dump_rules(struct sk_buff *skb,
unsigned int *idx,
struct netlink_callback *cb,
const struct nft_table *table,
- const struct nft_chain *chain,
- bool reset)
+ const struct nft_chain *chain)
{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
const struct nft_rule *rule, *prule;
- unsigned int s_idx = cb->args[0];
+ unsigned int entries = 0;
+ int ret = 0;
u64 handle;
prule = NULL;
list_for_each_entry_rcu(rule, &chain->rules, list) {
if (!nft_is_active(net, rule))
goto cont_skip;
- if (*idx < s_idx)
+ if (*idx < ctx->s_idx)
goto cont;
- if (*idx > s_idx) {
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- }
if (prule)
handle = prule->handle;
else
@@ -3180,46 +3802,48 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
NFT_MSG_NEWRULE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- table, chain, rule, handle, reset) < 0)
- return 1;
-
+ table, chain, rule, handle, ctx->reset) < 0) {
+ ret = 1;
+ break;
+ }
+ entries++;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
prule = rule;
cont_skip:
(*idx)++;
}
- return 0;
+
+ if (ctx->reset && entries)
+ audit_log_rule_reset(table, cb->seq, entries);
+
+ return ret;
}
static int nf_tables_dump_rules(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_rule_dump_ctx *ctx = cb->data;
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
struct nft_table *table;
const struct nft_chain *chain;
unsigned int idx = 0;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nftables_pernet *nft_net;
- bool reset = false;
-
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
- reset = true;
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
- if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
+ if (ctx->table && strcmp(ctx->table, table->name) != 0)
continue;
- if (ctx && ctx->table && ctx->chain) {
+ if (ctx->table && ctx->chain) {
struct rhlist_head *list, *tmp;
list = rhltable_lookup(&table->chains_ht, ctx->chain,
@@ -3231,7 +3855,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
if (!nft_is_active(net, chain))
continue;
__nf_tables_dump_rules(skb, &idx,
- cb, table, chain, reset);
+ cb, table, chain);
break;
}
goto done;
@@ -3239,68 +3863,81 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
list_for_each_entry_rcu(chain, &table->chains, list) {
if (__nf_tables_dump_rules(skb, &idx,
- cb, table, chain, reset))
+ cb, table, chain))
goto done;
}
- if (ctx && ctx->table)
+ if (ctx->table)
break;
}
done:
rcu_read_unlock();
- cb->args[0] = idx;
+ ctx->s_idx = idx;
return skb->len;
}
+/* Dump callback used for dump-and-reset requests: runs
+ * nf_tables_dump_rules() with the per-netns commit_mutex held and
+ * returns its result unchanged.
+ */
+static int nf_tables_dumpreset_rules(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+ /* The mutex is held to prevent two concurrent dump-and-reset calls
+ * from underrunning counters and quotas. The commit_mutex is used for
+ * lack of a better lock; this is not the transaction path.
+ */
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_rules(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_rules_start(struct netlink_callback *cb)
{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
const struct nlattr * const *nla = cb->data;
- struct nft_rule_dump_ctx *ctx = NULL;
- if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
- ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
- if (!ctx)
- return -ENOMEM;
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (nla[NFTA_RULE_TABLE]) {
- ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
- GFP_ATOMIC);
- if (!ctx->table) {
- kfree(ctx);
- return -ENOMEM;
- }
- }
- if (nla[NFTA_RULE_CHAIN]) {
- ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
- GFP_ATOMIC);
- if (!ctx->chain) {
- kfree(ctx->table);
- kfree(ctx);
- return -ENOMEM;
- }
+ if (nla[NFTA_RULE_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC);
+ if (!ctx->table)
+ return -ENOMEM;
+ }
+ if (nla[NFTA_RULE_CHAIN]) {
+ ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC);
+ if (!ctx->chain) {
+ kfree(ctx->table);
+ return -ENOMEM;
}
}
-
- cb->data = ctx;
return 0;
}
+/* Dump start callback for dump-and-reset requests: flags the dump context
+ * kept in cb->ctx as a reset dump, then defers to
+ * nf_tables_dump_rules_start() and returns its result.
+ */
+static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb)
+{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_rules_start(cb);
+}
+
static int nf_tables_dump_rules_done(struct netlink_callback *cb)
{
- struct nft_rule_dump_ctx *ctx = cb->data;
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
- if (ctx) {
- kfree(ctx->table);
- kfree(ctx->chain);
- kfree(ctx);
- }
+ kfree(ctx->table);
+ kfree(ctx->chain);
return 0;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
- const struct nlattr * const nla[])
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_cur(info->net);
@@ -3310,61 +3947,113 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- bool reset = false;
int err;
- if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start= nf_tables_dump_rules_start,
- .dump = nf_tables_dump_rules,
- .done = nf_tables_dump_rules_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
- }
-
table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
- return PTR_ERR(chain);
+ return ERR_CAST(chain);
}
- rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
- return PTR_ERR(rule);
+ return ERR_CAST(rule);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
-
- if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
- reset = true;
+ return ERR_PTR(-ENOMEM);
- err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
+ err = nf_tables_fill_rule_info(skb2, net, portid,
info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
family, table, chain, rule, 0, reset);
- if (err < 0)
- goto err_fill_rule_info;
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+ return skb2;
+}
-err_fill_rule_info:
- kfree_skb(skb2);
- return err;
+/* Handle a rule get request.
+ *
+ * With NLM_F_DUMP set, start a netlink dump driven by the
+ * nf_tables_dump_rules_start/nf_tables_dump_rules/nf_tables_dump_rules_done
+ * callbacks, handing @nla to the start callback through .data.
+ * Otherwise build the reply for a single rule with reset disabled and
+ * unicast it to the requesting portid.
+ *
+ * Returns the dump-start or unicast result, or the error encoded in the
+ * pointer returned by nf_tables_getrule_single().
+ */
+static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start= nf_tables_dump_rules_start,
+ .dump = nf_tables_dump_rules,
+ .done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ skb2 = nf_tables_getrule_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ return nfnetlink_unicast(skb2, net, portid);
+}
+
+/* Handle a rule get-and-reset request.
+ *
+ * With NLM_F_DUMP set, start a netlink dump using the dumpreset start and
+ * dump callbacks (the done callback is shared with the plain dump).
+ * Otherwise build the single-rule reply with reset enabled while holding
+ * the per-netns commit_mutex, log one AUDIT_NFT_OP_RULE_RESET record and
+ * unicast the reply to the requesting portid.
+ *
+ * Returns -EINVAL if no reference on THIS_MODULE can be taken, the error
+ * encoded by nf_tables_getrule_single(), or the dump-start/unicast result.
+ */
+static int nf_tables_getrule_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start= nf_tables_dumpreset_rules_start,
+ .dump = nf_tables_dumpreset_rules,
+ .done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ /* A module reference is held while the RCU read-side section is left
+ * to take commit_mutex; the section is re-entered before returning.
+ * NOTE(review): presumably the handler is invoked under
+ * rcu_read_lock() and the mutex may sleep - confirm against the
+ * callback registration.
+ */
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getrule_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ /* Audit record name is "<table attribute payload>:<base sequence>";
+ * the entry count is fixed at 1 for the single-rule path.
+ */
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_RULE_TABLE]),
+ (char *)nla_data(nla[NFTA_RULE_TABLE]),
+ nft_base_seq(net));
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
+}
-static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
{
struct nft_expr *expr, *next;
@@ -3381,16 +4070,27 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
kfree(rule);
}
-void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
+/* can only be used if rule is no longer visible to dumps */
+static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net));
+
nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
nf_tables_rule_destroy(ctx, rule);
}
+/** nft_chain_validate - loop detection and hook validation
+ *
+ * @ctx: context containing call depth and base chain
+ * @chain: chain to validate
+ *
+ * Walk through the rules of the given chain and chase all jumps/gotos
+ * and set lookups until either the jump limit is hit or all reachable
+ * chains have been validated.
+ */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
{
struct nft_expr *expr, *last;
- const struct nft_data *data;
struct nft_rule *rule;
int err;
@@ -3398,6 +4098,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
return -EMLINK;
list_for_each_entry(rule, &chain->rules, list) {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
if (!nft_is_active_next(ctx->net, rule))
continue;
@@ -3405,12 +4108,13 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (!expr->ops->validate)
continue;
- err = expr->ops->validate(ctx, expr, &data);
+ /* This may call nft_chain_validate() recursively,
+ * callers that do so must increment ctx->level.
+ */
+ err = expr->ops->validate(ctx, expr);
if (err < 0)
return err;
}
-
- cond_resched();
}
return 0;
@@ -3434,11 +4138,70 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
err = nft_chain_validate(&ctx, chain);
if (err < 0)
return err;
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/* Validate a single set element.
+ *
+ * Elements that are not active in iter->genmask, and elements flagged as
+ * NFT_SET_ELEM_INTERVAL_END, are skipped and reported as valid. For an
+ * element whose data holds an NFT_JUMP or NFT_GOTO verdict, the target
+ * chain is validated one call level deeper via nft_chain_validate().
+ *
+ * Returns 0 on success or the negative error from nft_chain_validate().
+ */
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+ /* const is cast away so that ctx->level can be adjusted below */
+ struct nft_ctx *pctx = (struct nft_ctx *)ctx;
+ const struct nft_data *data;
+ int err;
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ /* NOTE(review): level is not decremented on the error return
+ * below; this looks like it relies on callers abandoning ctx
+ * after a failure - confirm.
+ */
+ pctx->level++;
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ pctx->level--;
+ break;
+ default:
+ break;
}
return 0;
}
+/* Validate the catchall elements of @set.
+ *
+ * Walks set->catchall_list and runs nft_setelem_validate() on every
+ * element that is active in the next generation. A stack iterator that
+ * carries only the genmask is handed to the per-element helper.
+ *
+ * Returns the first negative error reported by nft_setelem_validate(),
+ * or 0 if every element validated (or none was active).
+ */
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter dummy_iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ };
+ struct nft_set_elem_catchall *catchall;
+
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, dummy_iter.genmask))
+ continue;
+
+ ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+}
+
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
const struct nft_chain *chain,
const struct nlattr *nla);
@@ -3483,11 +4246,10 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
- if (nft_chain_is_bound(chain))
- return -EOPNOTSUPP;
} else if (nla[NFTA_RULE_CHAIN_ID]) {
- chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]);
+ chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
+ genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
return PTR_ERR(chain);
@@ -3496,9 +4258,12 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return -EINVAL;
}
+ if (nft_chain_is_bound(chain))
+ return -EOPNOTSUPP;
+
if (nla[NFTA_RULE_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
- rule = __nft_rule_lookup(chain, handle);
+ rule = __nft_rule_lookup(net, chain, handle);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
@@ -3518,12 +4283,9 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return -EINVAL;
handle = nf_tables_alloc_handle(table);
- if (chain->use == UINT_MAX)
- return -EOVERFLOW;
-
if (nla[NFTA_RULE_POSITION]) {
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
- old_rule = __nft_rule_lookup(chain, pos_handle);
+ old_rule = __nft_rule_lookup(net, chain, pos_handle);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
@@ -3600,7 +4362,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
}
if (expr_info[i].ops->validate)
- nft_validate_state_update(net, NFT_VALIDATE_NEED);
+ nft_validate_state_update(table, NFT_VALIDATE_NEED);
expr_info[i].ops = NULL;
expr = nft_expr_next(expr);
@@ -3614,7 +4376,17 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
}
}
+ if (!nft_use_inc(&chain->use)) {
+ err = -EMFILE;
+ goto err_release_rule;
+ }
+
if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (nft_chain_binding(chain)) {
+ err = -EOPNOTSUPP;
+ goto err_destroy_flow_rule;
+ }
+
err = nft_delrule(&ctx, old_rule);
if (err < 0)
goto err_destroy_flow_rule;
@@ -3645,21 +4417,22 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
}
}
kvfree(expr_info);
- chain->use++;
if (flow)
nft_trans_flow_rule(trans) = flow;
- if (nft_net->validate_state == NFT_VALIDATE_DO)
+ if (table->validate_state == NFT_VALIDATE_DO)
return nft_table_validate(net, table);
return 0;
err_destroy_flow_rule:
+ nft_use_dec_restore(&chain->use);
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
@@ -3682,12 +4455,10 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
struct nft_trans *trans;
list_for_each_entry(trans, &nft_net->commit_list, list) {
- struct nft_rule *rule = nft_trans_rule(trans);
-
if (trans->msg_type == NFT_MSG_NEWRULE &&
- trans->ctx.chain == chain &&
+ nft_trans_rule_chain(trans) == chain &&
id == nft_trans_rule_id(trans))
- return rule;
+ return nft_trans_rule(trans);
}
return ERR_PTR(-ENOENT);
}
@@ -3716,10 +4487,14 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
genmask);
if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
- if (nft_chain_is_bound(chain))
+ if (nft_chain_binding(chain))
return -EOPNOTSUPP;
}
@@ -3727,8 +4502,12 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
- rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule)) {
+ if (PTR_ERR(rule) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}
@@ -3749,6 +4528,8 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
list_for_each_entry(chain, &table->chains, list) {
if (!nft_is_active_next(net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
ctx.chain = chain;
err = nft_delrule_by_chain(&ctx);
@@ -3790,23 +4571,18 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
* given, in that case the amount of memory per element is used.
*/
static const struct nft_set_ops *
-nft_select_set_ops(const struct nft_ctx *ctx,
- const struct nlattr * const nla[],
+nft_select_set_ops(const struct nft_ctx *ctx, u32 flags,
const struct nft_set_desc *desc)
{
struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nft_set_ops *ops, *bops;
struct nft_set_estimate est, best;
const struct nft_set_type *type;
- u32 flags = 0;
int i;
lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
- if (nla[NFTA_SET_FLAGS] != NULL)
- flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
-
bops = NULL;
best.size = ~0;
best.lookup = ~0;
@@ -3874,15 +4650,22 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
[NFTA_SET_EXPR] = { .type = NLA_NESTED },
- [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
+ [NFTA_SET_TYPE] = { .type = NLA_REJECT },
+ [NFTA_SET_COUNT] = { .type = NLA_REJECT },
+};
+
+static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
+ [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
- [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED },
+ [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy),
};
-static struct nft_set *nft_set_lookup(const struct nft_table *table,
+static struct nft_set *nft_set_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nft_set *set;
@@ -3890,7 +4673,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(set, &table->sets, list) {
+ list_for_each_entry_rcu(set, &table->sets, list,
+ lockdep_commit_lock_is_held(net)) {
if (!nla_strcmp(nla, set->name) &&
nft_active_genmask(set, genmask))
return set;
@@ -3918,17 +4702,16 @@ static struct nft_set *nft_set_lookup_byid(const struct net *net,
{
struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
- struct nft_trans *trans;
+ struct nft_trans_set *trans;
- list_for_each_entry(trans, &nft_net->commit_list, list) {
- if (trans->msg_type == NFT_MSG_NEWSET) {
- struct nft_set *set = nft_trans_set(trans);
+ /* it's likely the id we need is at the tail, not at the start */
+ list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) {
+ struct nft_set *set = trans->set;
- if (id == nft_trans_set_id(trans) &&
- set->table == table &&
- nft_active_genmask(set, genmask))
- return set;
- }
+ if (id == trans->set_id &&
+ set->table == table &&
+ nft_active_genmask(set, genmask))
+ return set;
}
return ERR_PTR(-ENOENT);
}
@@ -3941,7 +4724,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
{
struct nft_set *set;
- set = nft_set_lookup(table, nla_set_name, genmask);
+ set = nft_set_lookup(net, table, nla_set_name, genmask);
if (IS_ERR(set)) {
if (!nla_set_id)
return set;
@@ -3965,6 +4748,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
+ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
+ return -EINVAL;
+
inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (inuse == NULL)
return -ENOMEM;
@@ -4017,7 +4803,7 @@ int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
return -ERANGE;
ms *= NSEC_PER_MSEC;
- *result = nsecs_to_jiffies64(ms);
+ *result = nsecs_to_jiffies64(ms) ? : !!ms;
return 0;
}
@@ -4053,6 +4839,35 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb,
return 0;
}
+/* Translate @size into the value reported to userspace: the set backend's
+ * ops->usize() conversion is applied when the backend provides one,
+ * otherwise @size is returned unchanged.
+ */
+static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size)
+{
+ if (ops->usize)
+ return ops->usize(size);
+
+ return size;
+}
+
+/* Append the informational set attributes to @skb.
+ *
+ * NFTA_SET_TYPE carries set->ops formatted with "%ps" into a 40-byte
+ * buffer; NFTA_SET_COUNT carries the current element count, converted
+ * with nft_set_userspace_size() and stored in network byte order.
+ *
+ * Returns -EMSGSIZE if the type string cannot be added, otherwise the
+ * result of adding NFTA_SET_COUNT.
+ */
+static noinline_for_stack int
+nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set)
+{
+ unsigned int nelems;
+ char str[40];
+ int ret;
+
+ ret = snprintf(str, sizeof(str), "%ps", set->ops);
+
+ /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped
+ * to userspace purely for informational/debug purposes.
+ */
+ DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str));
+
+ if (nla_put_string(skb, NFTA_SET_TYPE, str))
+ return -EMSGSIZE;
+
+ nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems));
+ return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems));
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
@@ -4064,9 +4879,10 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
u32 seq = ctx->seq;
int i;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
- NFNETLINK_V0, nft_base_seq(ctx->net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, ctx->family, NFNETLINK_V0,
+ nft_base_seq_be16(ctx->net));
if (!nlh)
goto nla_put_failure;
@@ -4077,6 +4893,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
NFTA_SET_PAD))
goto nla_put_failure;
+
+ if (event == NFT_MSG_DELSET ||
+ event == NFT_MSG_DESTROYSET) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (set->flags != 0)
if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
goto nla_put_failure;
@@ -4117,7 +4940,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (!nest)
goto nla_put_failure;
if (set->size &&
- nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE,
+ htonl(nft_set_userspace_size(set->ops, set->size))))
goto nla_put_failure;
if (set->field_count > 1 &&
@@ -4126,6 +4950,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nla_nest_end(skb, nest);
+ if (nf_tables_fill_set_info(skb, set))
+ goto nla_put_failure;
+
if (set->num_exprs == 1) {
nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
@@ -4200,7 +5027,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
@@ -4308,9 +5135,11 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
@@ -4327,10 +5156,6 @@ err_fill_set_info:
return err;
}
-static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
- [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
-};
-
static int nft_set_desc_concat_parse(const struct nlattr *attr,
struct nft_set_desc *desc)
{
@@ -4361,8 +5186,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
static int nft_set_desc_concat(struct nft_set_desc *desc,
const struct nlattr *nla)
{
+ u32 len = 0, num_regs;
struct nlattr *attr;
- u32 num_regs = 0;
int rem, err, i;
nla_for_each_nested(attr, nla, rem) {
@@ -4375,8 +5200,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
}
for (i = 0; i < desc->field_count; i++)
- num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32));
+ len += round_up(desc->field_len[i], sizeof(u32));
+ if (len != desc->klen)
+ return -EINVAL;
+
+ num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
if (num_regs > NFT_REG32_COUNT)
return -E2BIG;
@@ -4483,6 +5312,15 @@ static bool nft_set_is_same(const struct nft_set *set,
return true;
}
+/* Translate the requested desc->size into the size the set backend uses
+ * internally: ops->ksize() is applied when the backend provides one,
+ * otherwise desc->size is returned unchanged.
+ */
+static u32 nft_set_kernel_size(const struct nft_set_ops *ops,
+ const struct nft_set_desc *desc)
+{
+ if (ops->ksize)
+ return ops->ksize(desc->size);
+
+ return desc->size;
+}
+
static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
@@ -4538,6 +5376,12 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
(NFT_SET_EVAL | NFT_SET_OBJECT))
return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) ==
+ (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT))
+ return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) ==
+ (NFT_SET_CONSTANT | NFT_SET_TIMEOUT))
+ return -EOPNOTSUPP;
}
desc.dtype = 0;
@@ -4579,6 +5423,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
+ if (flags & NFT_SET_ANONYMOUS)
+ return -EOPNOTSUPP;
+
err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout);
if (err)
return err;
@@ -4587,20 +5434,36 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
+
+ if (flags & NFT_SET_ANONYMOUS)
+ return -EOPNOTSUPP;
+
desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
}
desc.policy = NFT_SET_POL_PERFORMANCE;
- if (nla[NFTA_SET_POLICY] != NULL)
+ if (nla[NFTA_SET_POLICY] != NULL) {
desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+ switch (desc.policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ case NFT_SET_POL_MEMORY:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
if (nla[NFTA_SET_DESC] != NULL) {
err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
- if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT))
+ if (desc.field_count > 1) {
+ if (!(flags & NFT_SET_CONCAT))
+ return -EINVAL;
+ } else if (flags & NFT_SET_CONCAT) {
return -EINVAL;
+ }
} else if (flags & NFT_SET_CONCAT) {
return -EINVAL;
}
@@ -4617,7 +5480,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
if (PTR_ERR(set) != -ENOENT) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
@@ -4633,10 +5496,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
+ if (nft_set_is_anonymous(set))
+ return -EOPNOTSUPP;
+
err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags);
if (err < 0)
return err;
+ if (desc.size)
+ desc.size = nft_set_kernel_size(set->ops, &desc);
+
err = 0;
if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
@@ -4655,10 +5524,13 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
- ops = nft_select_set_ops(&ctx, nla, &desc);
+ ops = nft_select_set_ops(&ctx, flags, &desc);
if (IS_ERR(ops))
return PTR_ERR(ops);
+ if (desc.size)
+ desc.size = nft_set_kernel_size(ops, &desc);
+
udlen = 0;
if (nla[NFTA_SET_USERDATA])
udlen = nla_len(nla[NFTA_SET_USERDATA]);
@@ -4669,9 +5541,15 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
alloc_size = sizeof(*set) + size + udlen;
if (alloc_size < size || alloc_size > INT_MAX)
return -ENOMEM;
+
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
- if (!set)
- return -ENOMEM;
+ if (!set) {
+ err = -ENOMEM;
+ goto err_alloc;
+ }
name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
if (!name) {
@@ -4692,6 +5570,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
INIT_LIST_HEAD(&set->bindings);
INIT_LIST_HEAD(&set->catchall_list);
+ refcount_set(&set->refs, 1);
set->table = table;
write_pnet(&set->net, net);
set->ops = ops;
@@ -4722,33 +5601,31 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
set->num_exprs = num_exprs;
set->handle = nf_tables_alloc_handle(table);
+ INIT_LIST_HEAD(&set->pending_update);
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
goto err_set_expr_alloc;
list_add_tail_rcu(&set->list, &table->sets);
- table->use++;
+
return 0;
err_set_expr_alloc:
for (i = 0; i < set->num_exprs; i++)
nft_expr_destroy(&ctx, set->exprs[i]);
err_set_destroy:
- ops->destroy(set);
+ ops->destroy(&ctx, set);
err_set_init:
kfree(set->name);
err_set_name:
kvfree(set);
+err_alloc:
+ nft_use_dec_restore(&table->use);
+
return err;
}
-struct nft_set_elem_catchall {
- struct list_head list;
- struct rcu_head rcu;
- void *elem;
-};
-
static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
struct nft_set *set)
{
@@ -4756,11 +5633,19 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
list_del_rcu(&catchall->list);
- nft_set_elem_destroy(set, catchall->elem, true);
+ nf_tables_set_elem_destroy(ctx, set, catchall->elem);
kfree_rcu(catchall, rcu);
}
}
+/* Drop one reference on @set; the set name and the set itself are freed
+ * only when the reference count reaches zero.
+ */
+static void nft_set_put(struct nft_set *set)
+{
+	if (!refcount_dec_and_test(&set->refs))
+		return;
+
+	kfree(set->name);
+	kvfree(set);
+}
+
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
int i;
@@ -4771,10 +5656,9 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
for (i = 0; i < set->num_exprs; i++)
nft_expr_destroy(ctx, set->exprs[i]);
- set->ops->destroy(set);
+ set->ops->destroy(ctx, set);
nft_set_catchall_destroy(ctx, set);
- kfree(set->name);
- kvfree(set);
+ nft_set_put(set);
}
static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
@@ -4804,10 +5688,14 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
set = nft_set_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_SET_NAME];
- set = nft_set_lookup(table, attr, genmask);
+ set = nft_set_lookup(net, table, attr, genmask);
}
if (IS_ERR(set)) {
+ if (PTR_ERR(set) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(set);
}
@@ -4831,9 +5719,9 @@ static int nft_validate_register_store(const struct nft_ctx *ctx,
static int nft_setelem_data_validate(const struct nft_ctx *ctx,
struct nft_set *set,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
enum nft_registers dreg;
dreg = nft_type_to_reg(set->dtype);
@@ -4846,9 +5734,14 @@ static int nft_setelem_data_validate(const struct nft_ctx *ctx,
static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- return nft_setelem_data_validate(ctx, set, elem);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ return nft_setelem_data_validate(ctx, set, elem_priv);
}
static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
@@ -4856,17 +5749,16 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
{
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set_elem_catchall *catchall;
- struct nft_set_elem elem;
struct nft_set_ext *ext;
int ret = 0;
- list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
ext = nft_set_elem_ext(set, catchall->elem);
if (!nft_set_elem_active(ext, genmask))
continue;
- elem.priv = catchall->elem;
- ret = nft_setelem_data_validate(ctx, set, &elem);
+ ret = nft_setelem_data_validate(ctx, set, catchall->elem);
if (ret < 0)
break;
}
@@ -4878,10 +5770,11 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
{
struct nft_set_binding *i;
- struct nft_set_iter iter;
-
- if (set->use == UINT_MAX)
- return -EOVERFLOW;
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nf_tables_bind_check_setelem,
+ };
if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
return -EBUSY;
@@ -4896,12 +5789,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
goto bind;
}
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nf_tables_bind_check_setelem;
-
set->ops->walk(ctx, set, &iter);
if (!iter.err)
iter.err = nft_set_catchall_bind_check(ctx, set);
@@ -4910,10 +5797,12 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
return iter.err;
}
bind:
+ if (!nft_use_inc(&set->use))
+ return -EMFILE;
+
binding->chain = ctx->chain;
list_add_tail_rcu(&binding->list, &set->bindings);
nft_set_trans_bind(ctx, set);
- set->use++;
return 0;
}
@@ -4926,23 +5815,111 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) {
list_del_rcu(&set->list);
+ set->dead = 1;
if (event)
nf_tables_set_notify(ctx, set, NFT_MSG_DELSET,
GFP_KERNEL);
}
}
+static void nft_setelem_data_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv);
+
+/* Set walk callback: re-activate one map element.
+ *
+ * Called from the abort path, so the usual activity test is reversed:
+ * elements that are already active in iter->genmask are left alone, the
+ * others are cleared and their data is activated again. Always returns 0.
+ */
+static int nft_mapelem_activate(const struct nft_ctx *ctx,
+				struct nft_set *set,
+				const struct nft_set_iter *iter,
+				struct nft_elem_priv *elem_priv)
+{
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+	/* called from abort path, reverse check to undo changes. */
+	if (!nft_set_elem_active(ext, iter->genmask)) {
+		nft_clear(ctx->net, ext);
+		nft_setelem_data_activate(ctx->net, set, elem_priv);
+	}
+
+	return 0;
+}
+
+/* Re-activate the catchall element of a map, undoing a previous
+ * deactivation.
+ *
+ * Walks set->catchall_list and restores the first catchall element that is
+ * not active in the next generation: its generation bits are cleared and
+ * its data is activated again. At most one element is processed.
+ */
+static void nft_map_catchall_activate(const struct nft_ctx *ctx,
+				      struct nft_set *set)
+{
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_ext *ext;
+
+	list_for_each_entry(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		/* Same reversed check as nft_mapelem_activate(): this undoes
+		 * a deactivation, so elements that are still active must be
+		 * skipped and the inactive one must be restored.
+		 */
+		if (nft_set_elem_active(ext, genmask))
+			continue;
+
+		nft_clear(ctx->net, ext);
+		nft_setelem_data_activate(ctx->net, set, catchall->elem);
+		break;
+	}
+}
+
+/* Re-activate every element of a map.
+ *
+ * Walks the set backend with nft_mapelem_activate() as the per-element
+ * callback, using the next-generation mask and an NFT_ITER_UPDATE walk,
+ * then handles the catchall list via nft_map_catchall_activate().
+ */
+static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_mapelem_activate,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ /* nft_mapelem_activate() always returns 0, so an error here is unexpected. */
+ WARN_ON_ONCE(iter.err);
+
+ nft_map_catchall_activate(ctx, set);
+}
+
+/* Undo the NFT_TRANS_PREPARE step of nf_tables_deactivate_set().
+ *
+ * For an anonymous set, map/object elements are re-activated first and the
+ * set itself is then passed to nft_clear() (presumably marking it active
+ * again in the next generation -- confirm against nft_clear()). In all cases
+ * the use counter dropped during deactivation is restored.
+ */
+void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ if (nft_set_is_anonymous(set)) {
+ /* Only maps and object maps carry element data that needs re-activation. */
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_activate(ctx, set);
+
+ nft_clear(ctx->net, set);
+ }
+
+ nft_use_inc_restore(&set->use);
+}
+EXPORT_SYMBOL_GPL(nf_tables_activate_set);
+
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding,
enum nft_trans_phase phase)
{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net));
+
switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ nft_set_trans_unbind(ctx, set);
+ if (nft_set_is_anonymous(set))
+ nft_deactivate_next(ctx->net, set);
+ else
+ list_del_rcu(&binding->list);
+
+ nft_use_dec(&set->use);
+ break;
case NFT_TRANS_PREPARE:
- set->use--;
+ if (nft_set_is_anonymous(set)) {
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
+ nft_deactivate_next(ctx->net, set);
+ }
+ nft_use_dec(&set->use);
return;
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
- set->use--;
+ if (nft_set_is_anonymous(set) &&
+ set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
+ nft_use_dec(&set->use);
fallthrough;
default:
nf_tables_unbind_set(ctx, set, binding,
@@ -4977,12 +5954,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u8),
},
[NFT_SET_EXT_TIMEOUT] = {
- .len = sizeof(u64),
- .align = __alignof__(u64),
- },
- [NFT_SET_EXT_EXPIRATION] = {
- .len = sizeof(u64),
- .align = __alignof__(u64),
+ .len = sizeof(struct nft_timeout),
+ .align = __alignof__(struct nft_timeout),
},
[NFT_SET_EXT_USERDATA] = {
.len = sizeof(struct nft_userdata),
@@ -5009,7 +5982,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
.len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
- [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -5017,13 +5990,14 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
- [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy),
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
static int nft_set_elem_expr_dump(struct sk_buff *skb,
const struct nft_set *set,
- const struct nft_set_ext *ext)
+ const struct nft_set_ext *ext,
+ bool reset)
{
struct nft_set_elem_expr *elem_expr;
u32 size, num_exprs = 0;
@@ -5036,7 +6010,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb,
if (num_exprs == 1) {
expr = nft_setelem_expr_at(elem_expr, 0);
- if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0)
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0)
return -1;
return 0;
@@ -5047,7 +6021,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb,
nft_setelem_expr_foreach(expr, elem_expr, size) {
expr = nft_setelem_expr_at(elem_expr, size);
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -5060,9 +6034,10 @@ nla_put_failure:
static int nf_tables_fill_setelem(struct sk_buff *skb,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv,
+ bool reset)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -5082,12 +6057,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
- set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
- set->dlen) < 0)
+ nft_set_datatype(set), set->dlen) < 0)
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) &&
- nft_set_elem_expr_dump(skb, set, ext))
+ nft_set_elem_expr_dump(skb, set, ext, reset))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
@@ -5100,25 +6074,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
htonl(*nft_set_ext_flags(ext))))
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
- nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
- NFTA_SET_ELEM_PAD))
- goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+ u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout);
+ u64 set_timeout = READ_ONCE(set->timeout);
+ __be64 msecs = 0;
+
+ if (set_timeout != timeout) {
+ msecs = nf_jiffies64_to_msecs(timeout);
+ if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs,
+ NFTA_SET_ELEM_PAD))
+ goto nla_put_failure;
+ }
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- u64 expires, now = get_jiffies_64();
+ if (timeout > 0) {
+ u64 expires, now = get_jiffies_64();
- expires = *nft_set_ext_expiration(ext);
- if (time_before64(now, expires))
- expires -= now;
- else
- expires = 0;
+ expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration);
+ if (time_before64(now, expires))
+ expires -= now;
+ else
+ expires = 0;
- if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- nf_jiffies64_to_msecs(expires),
- NFTA_SET_ELEM_PAD))
- goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
+ nf_jiffies64_to_msecs(expires),
+ NFTA_SET_ELEM_PAD))
+ goto nla_put_failure;
+ }
}
if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) {
@@ -5142,30 +6123,50 @@ struct nft_set_dump_args {
const struct netlink_callback *cb;
struct nft_set_iter iter;
struct sk_buff *skb;
+ bool reset;
};
static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_set_dump_args *args;
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext))
+ return 0;
+
args = container_of(iter, struct nft_set_dump_args, iter);
- return nf_tables_fill_setelem(args->skb, set, elem);
+ return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset);
+}
+
+/* Emit an audit record for a set element reset.
+ *
+ * @table:    table owning the set; its name and family go into the record
+ * @base_seq: sequence number appended to the table name as "<name>:<seq>"
+ * @nentries: number of elements covered by this record
+ *
+ * The temporary name buffer is allocated with GFP_ATOMIC and freed before
+ * returning.
+ */
+static void audit_log_nft_set_reset(const struct nft_table *table,
+ unsigned int base_seq,
+ unsigned int nentries)
+{
+ /* NOTE(review): buf is not checked for NULL before being handed to
+ * audit_log_nfcfg(); confirm that callee tolerates a NULL name.
+ */
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
+ kfree(buf);
 }
struct nft_set_dump_ctx {
const struct nft_set *set;
struct nft_ctx ctx;
+ bool reset;
};
static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
- const struct nft_set *set)
+ const struct nft_set *set, bool reset,
+ unsigned int base_seq)
{
struct nft_set_elem_catchall *catchall;
u8 genmask = nft_genmask_cur(net);
- struct nft_set_elem elem;
struct nft_set_ext *ext;
int ret = 0;
@@ -5175,8 +6176,9 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
nft_set_elem_expired(ext))
continue;
- elem.priv = catchall->elem;
- ret = nf_tables_fill_setelem(skb, set, &elem);
+ ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset);
+ if (reset && !ret)
+ audit_log_nft_set_reset(set->table, base_seq, 1);
break;
}
@@ -5190,7 +6192,17 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
struct nftables_pernet *nft_net;
struct nft_table *table;
struct nft_set *set;
- struct nft_set_dump_args args;
+ struct nft_set_dump_args args = {
+ .cb = cb,
+ .skb = skb,
+ .reset = dump_ctx->reset,
+ .iter = {
+ .genmask = nft_genmask_cur(net),
+ .type = NFT_ITER_READ,
+ .skip = cb->args[0],
+ .fn = nf_tables_dump_setelem,
+ },
+ };
bool set_found = false;
struct nlmsghdr *nlh;
struct nlattr *nest;
@@ -5199,7 +6211,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
@@ -5228,7 +6240,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
seq = cb->nlh->nlmsg_seq;
nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
- table->family, NFNETLINK_V0, nft_base_seq(net));
+ table->family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -5241,22 +6253,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto nla_put_failure;
- args.cb = cb;
- args.skb = skb;
- args.iter.genmask = nft_genmask_cur(net);
- args.iter.skip = cb->args[0];
- args.iter.count = 0;
- args.iter.err = 0;
- args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&dump_ctx->ctx, set, &args.iter);
if (!args.iter.err && args.iter.count == cb->args[0])
- args.iter.err = nft_set_catchall_dump(net, skb, set);
- rcu_read_unlock();
-
+ args.iter.err = nft_set_catchall_dump(net, skb, set,
+ dump_ctx->reset, cb->seq);
nla_nest_end(skb, nest);
nlmsg_end(skb, nlh);
+ rcu_read_unlock();
+
if (args.iter.err && args.iter.err != -EMSGSIZE)
return args.iter.err;
if (args.iter.count == cb->args[0])
@@ -5270,6 +6276,26 @@ nla_put_failure:
return -ENOSPC;
}
+/* Netlink dump callback for the reset variant of the set element dump.
+ *
+ * Runs nf_tables_dump_set() with the per-netns commit_mutex held, then emits
+ * one audit record covering the elements dumped by this invocation, i.e. how
+ * far cb->args[0] advanced. Returns whatever nf_tables_dump_set() returned.
+ */
+static int nf_tables_dumpreset_set(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ struct nft_set_dump_ctx *dump_ctx = cb->data;
+ /* Remember the starting position to compute the number of entries dumped. */
+ int ret, skip = cb->args[0];
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ ret = nf_tables_dump_set(skb, cb);
+
+ /* Only log when this pass actually advanced past the starting position. */
+ if (cb->args[0] > skip)
+ audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
+ cb->args[0] - skip);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_set_start(struct netlink_callback *cb)
{
struct nft_set_dump_ctx *dump_ctx = cb->data;
@@ -5289,7 +6315,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
const struct nft_ctx *ctx, u32 seq,
u32 portid, int event, u16 flags,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv,
+ bool reset)
{
struct nlmsghdr *nlh;
struct nlattr *nest;
@@ -5297,7 +6324,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
- NFNETLINK_V0, nft_base_seq(ctx->net));
+ NFNETLINK_V0, nft_base_seq_be16(ctx->net));
if (!nlh)
goto nla_put_failure;
@@ -5310,7 +6337,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
if (nest == NULL)
goto nla_put_failure;
- err = nf_tables_fill_setelem(skb, set, elem);
+ err = nf_tables_fill_setelem(skb, set, elem_priv, reset);
if (err < 0)
goto nla_put_failure;
@@ -5343,7 +6370,7 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
return 0;
}
-static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
+static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set,
struct nft_data *key, struct nlattr *attr)
{
struct nft_data_desc desc = {
@@ -5396,7 +6423,7 @@ static void *nft_setelem_catchall_get(const struct net *net,
return priv;
}
-static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set,
+static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set,
struct nft_set_elem *elem, u32 flags)
{
void *priv;
@@ -5415,8 +6442,8 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set,
return 0;
}
-static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
- const struct nlattr *attr)
+static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, bool reset)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_elem elem;
@@ -5460,7 +6487,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid,
- NFT_MSG_NEWSETELEM, 0, set, &elem);
+ NFT_MSG_NEWSETELEM, 0, set, elem.priv,
+ reset);
if (err < 0)
goto err_fill_setelem;
@@ -5471,10 +6499,11 @@ err_fill_setelem:
return err;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getsetelem(struct sk_buff *skb,
- const struct nfnl_info *info,
- const struct nlattr * const nla[])
+static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx,
+ const struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[],
+ bool reset)
{
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_cur(info->net);
@@ -5482,22 +6511,36 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
struct net *net = info->net;
struct nft_table *table;
struct nft_set *set;
- struct nlattr *attr;
- struct nft_ctx ctx;
- int rem, err = 0;
table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
- genmask, NETLINK_CB(skb).portid);
+ genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
return PTR_ERR(table);
}
- set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
- nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+ nft_ctx_init(&dump_ctx->ctx, net, skb,
+ info->nlh, family, table, NULL, nla);
+ dump_ctx->set = set;
+ dump_ctx->reset = reset;
+ return 0;
+}
+
+/* called with rcu_read_lock held */
+static int nf_tables_getsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_set_dump_ctx dump_ctx;
+ struct nlattr *attr;
+ int rem, err = 0;
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
@@ -5506,11 +6549,55 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
.done = nf_tables_dump_set_done,
.module = THIS_MODULE,
};
- struct nft_set_dump_ctx dump_ctx = {
- .set = set,
- .ctx = ctx,
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
+ if (err)
+ return err;
+
+ c.data = &dump_ctx;
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
+ return -EINVAL;
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int nf_tables_getsetelem_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_set_dump_ctx dump_ctx;
+ int rem, err = 0, nelems = 0;
+ struct nlattr *attr;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_set_start,
+ .dump = nf_tables_dumpreset_set,
+ .done = nf_tables_dump_set_done,
+ .module = THIS_MODULE,
};
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
+ if (err)
+ return err;
+
c.data = &dump_ctx;
return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
@@ -5518,20 +6605,38 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
return -EINVAL;
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ rcu_read_lock();
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
+ if (err)
+ goto out_unlock;
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_get_set_elem(&ctx, set, attr);
+ err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true);
if (err < 0) {
NL_SET_BAD_ATTR(extack, attr);
break;
}
+ nelems++;
}
+ audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);
+
+out_unlock:
+ rcu_read_unlock();
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
return err;
}
static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
const struct nft_set *set,
- const struct nft_set_elem *elem,
+ const struct nft_elem_priv *elem_priv,
int event)
{
struct nftables_pernet *nft_net;
@@ -5552,7 +6657,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
- set, elem);
+ set, elem_priv, false);
if (err < 0) {
kfree_skb(skb);
goto err;
@@ -5565,17 +6670,21 @@ err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
-static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx,
int msg_type,
struct nft_set *set)
{
+ struct nft_trans_elem *te;
struct nft_trans *trans;
- trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+ trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1));
if (trans == NULL)
return NULL;
- nft_trans_elem_set(trans) = set;
+ te = nft_trans_container_elem(trans);
+ te->nelems = 1;
+ te->set = set;
+
return trans;
}
@@ -5627,10 +6736,11 @@ static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
return 0;
}
-void *nft_set_elem_init(const struct nft_set *set,
- const struct nft_set_ext_tmpl *tmpl,
- const u32 *key, const u32 *key_end,
- const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+ const struct nft_set_ext_tmpl *tmpl,
+ const u32 *key, const u32 *key_end,
+ const u32 *data,
+ u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;
@@ -5657,13 +6767,14 @@ void *nft_set_elem_init(const struct nft_set *set,
nft_set_ext_data(ext), data, set->dlen) < 0)
goto err_ext_check;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+ nft_set_ext_timeout(ext)->timeout = timeout;
+
if (expiration == 0)
- *nft_set_ext_expiration(ext) += timeout;
+ expiration = timeout;
+
+ nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration;
}
- if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
- *nft_set_ext_timeout(ext) = timeout;
return elem;
@@ -5694,39 +6805,75 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
__nft_set_elem_expr_destroy(ctx, expr);
}
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
- bool destroy_expr)
+/* Drop references and destroy. Called from gc, dynset and abort path. */
+static void __nft_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
+ bool destroy_expr)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
- struct nft_ctx ctx = {
- .net = read_pnet(&set->net),
- .family = set->table->family,
- };
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
- nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
-
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use--;
- kfree(elem);
+ nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+
+ kfree(elem_priv);
+}
+
+/* Drop references and destroy. Called from gc and dynset.
+ *
+ * Exported wrapper around __nft_set_elem_destroy() for callers that have no
+ * nft_ctx at hand: a minimal context is built on the stack from the set
+ * itself (its netns and the family of its table).
+ */
+void nft_set_elem_destroy(const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
+ bool destroy_expr)
+{
+ /* Only .net and .family are filled in; the remaining fields stay zeroed. */
+ struct nft_ctx ctx = {
+ .net = read_pnet(&set->net),
+ .family = set->table->family,
+ };
+
+ __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr);
 }
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
-/* Only called from commit path, nft_setelem_data_deactivate() already deals
- * with the refcounting from the preparation phase.
+/* Drop references and destroy. Called from abort path.
+ *
+ * Every element recorded in the transaction @te is released, together with
+ * its expressions; slots whose priv pointer is NULL are left alone.
+ */
+static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te)
+{
+	int idx;
+
+	for (idx = 0; idx < te->nelems; idx++) {
+		/* skip update request, see nft_trans_elems_new_abort() */
+		if (te->elems[idx].priv)
+			__nft_set_elem_destroy(ctx, te->set,
+					       te->elems[idx].priv, true);
+	}
+}
+
+/* Destroy element. References have been already dropped in the preparation
+ * path via nft_setelem_data_deactivate().
*/
-static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
- const struct nft_set *set, void *elem)
+void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
- kfree(elem);
+ kfree(elem_priv);
+}
+
+/* Destroy every element recorded in the transaction @te by calling
+ * nf_tables_set_elem_destroy() on each slot of te->elems[].
+ *
+ * NOTE(review): unlike nft_trans_set_elem_destroy(), NULL priv slots are not
+ * skipped here; presumably callers never record update requests on this
+ * path -- confirm.
+ */
+static void nft_trans_elems_destroy(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++)
+ nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv);
 }
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
@@ -5740,7 +6887,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
if (!expr)
goto err_expr;
- err = nft_expr_clone(expr, set->exprs[i]);
+ err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT);
if (err < 0) {
kfree(expr);
goto err_expr;
@@ -5779,7 +6926,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
for (i = 0; i < num_exprs; i++) {
expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
- err = nft_expr_clone(expr, expr_array[i]);
+ err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
if (err < 0)
goto err_elem_expr_setup;
@@ -5809,7 +6956,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
if (nft_set_elem_active(ext, genmask) &&
- !nft_set_elem_expired(ext))
+ !nft_set_elem_expired(ext) &&
+ !nft_set_elem_is_dead(ext))
return ext;
}
@@ -5817,33 +6965,10 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
}
EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
-void *nft_set_catchall_gc(const struct nft_set *set)
-{
- struct nft_set_elem_catchall *catchall, *next;
- struct nft_set_ext *ext;
- void *elem = NULL;
-
- list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
- ext = nft_set_elem_ext(set, catchall->elem);
-
- if (!nft_set_elem_expired(ext) ||
- nft_set_elem_mark_busy(ext))
- continue;
-
- elem = catchall->elem;
- list_del_rcu(&catchall->list);
- kfree_rcu(catchall, rcu);
- break;
- }
-
- return elem;
-}
-EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
-
static int nft_setelem_catchall_insert(const struct net *net,
struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **pext)
+ struct nft_elem_priv **priv)
{
struct nft_set_elem_catchall *catchall;
u8 genmask = nft_genmask_next(net);
@@ -5852,12 +6977,12 @@ static int nft_setelem_catchall_insert(const struct net *net,
list_for_each_entry(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
if (nft_set_elem_active(ext, genmask)) {
- *pext = ext;
+ *priv = catchall->elem;
return -EEXIST;
}
}
- catchall = kmalloc(sizeof(*catchall), GFP_KERNEL);
+ catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT);
if (!catchall)
return -ENOMEM;
@@ -5870,22 +6995,23 @@ static int nft_setelem_catchall_insert(const struct net *net,
static int nft_setelem_insert(const struct net *net,
struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext, unsigned int flags)
+ struct nft_elem_priv **elem_priv,
+ unsigned int flags)
{
int ret;
if (flags & NFT_SET_ELEM_CATCHALL)
- ret = nft_setelem_catchall_insert(net, set, elem, ext);
+ ret = nft_setelem_catchall_insert(net, set, elem, elem_priv);
else
- ret = set->ops->insert(net, set, elem, ext);
+ ret = set->ops->insert(net, set, elem, elem_priv);
return ret;
}
static bool nft_setelem_is_catchall(const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
*nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
@@ -5895,15 +7021,46 @@ static bool nft_setelem_is_catchall(const struct nft_set *set,
}
static void nft_setelem_activate(struct net *net, struct nft_set *set,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
- if (nft_setelem_is_catchall(set, elem)) {
- nft_set_elem_change_active(net, set, ext);
- nft_set_elem_clear_busy(ext);
+ if (nft_setelem_is_catchall(set, elem_priv)) {
+ nft_clear(net, ext);
} else {
- set->ops->activate(net, set, elem);
+ set->ops->activate(net, set, elem_priv);
+ }
+}
+
+static void nft_trans_elem_update(const struct nft_set *set,
+ const struct nft_trans_one_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_elem_update *update = elem->update;
+
+ if (update->flags & NFT_TRANS_UPD_TIMEOUT)
+ WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout);
+
+ if (update->flags & NFT_TRANS_UPD_EXPIRATION)
+ WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration);
+}
+
+static void nft_trans_elems_add(const struct nft_ctx *ctx,
+ struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ struct nft_trans_one_elem *elem = &te->elems[i];
+
+ if (elem->update)
+ nft_trans_elem_update(te->set, elem);
+ else
+ nft_setelem_activate(ctx->net, te->set, elem->priv);
+
+ nf_tables_setelem_notify(ctx, te->set, elem->priv,
+ NFT_MSG_NEWSETELEM);
+ kfree(elem->update);
}
}
@@ -5916,8 +7073,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net,
list_for_each_entry(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_is_active(net, ext) ||
- nft_set_elem_mark_busy(ext))
+ if (!nft_is_active_next(net, ext))
continue;
kfree(elem->priv);
@@ -5960,16 +7116,21 @@ static int nft_setelem_deactivate(const struct net *net,
return ret;
}
+static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall)
+{
+ list_del_rcu(&catchall->list);
+ kfree_rcu(catchall, rcu);
+}
+
static void nft_setelem_catchall_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
struct nft_set_elem_catchall *catchall, *next;
list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
- if (catchall->elem == elem->priv) {
- list_del_rcu(&catchall->list);
- kfree_rcu(catchall, rcu);
+ if (catchall->elem == elem_priv) {
+ nft_setelem_catchall_destroy(catchall);
break;
}
}
@@ -5977,12 +7138,32 @@ static void nft_setelem_catchall_remove(const struct net *net,
static void nft_setelem_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- if (nft_setelem_is_catchall(set, elem))
- nft_setelem_catchall_remove(net, set, elem);
+ if (nft_setelem_is_catchall(set, elem_priv))
+ nft_setelem_catchall_remove(net, set, elem_priv);
else
- set->ops->remove(net, set, elem);
+ set->ops->remove(net, set, elem_priv);
+}
+
+static void nft_trans_elems_remove(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ WARN_ON_ONCE(te->elems[i].update);
+
+ nf_tables_setelem_notify(ctx, te->set,
+ te->elems[i].priv,
+ te->nft_trans.msg_type);
+
+ nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) {
+ atomic_dec(&te->set->nelems);
+ te->set->ndeact--;
+ }
+ }
}
static bool nft_setelem_valid_key_end(const struct nft_set *set,
@@ -6004,6 +7185,27 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set,
return true;
}
+static u32 nft_set_maxsize(const struct nft_set *set)
+{
+ u32 maxsize, delta;
+
+ if (!set->size)
+ return UINT_MAX;
+
+ if (set->ops->adjust_maxsize)
+ delta = set->ops->adjust_maxsize(set);
+ else
+ delta = 0;
+
+ if (check_add_overflow(set->size, set->ndeact, &maxsize))
+ return UINT_MAX;
+
+ if (check_add_overflow(maxsize, delta, &maxsize))
+ return UINT_MAX;
+
+ return maxsize;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
@@ -6015,13 +7217,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
+ struct nft_elem_priv *elem_priv;
struct nft_object *obj = NULL;
struct nft_userdata *udata;
struct nft_data_desc desc;
enum nft_registers dreg;
struct nft_trans *trans;
- u64 timeout;
u64 expiration;
+ u64 timeout;
int err, i;
u8 ulen;
@@ -6036,7 +7239,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
+ (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
return -EINVAL;
if (flags != 0) {
@@ -6087,17 +7291,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
} else if (set->flags & NFT_SET_TIMEOUT &&
!(flags & NFT_SET_ELEM_INTERVAL_END)) {
- timeout = READ_ONCE(set->timeout);
+ timeout = set->timeout;
}
expiration = 0;
if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
+ if (timeout == 0)
+ return -EOPNOTSUPP;
+
err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
&expiration);
if (err)
return err;
+
+ if (expiration > timeout)
+ return -ERANGE;
}
if (nla[NFTA_SET_ELEM_EXPR]) {
@@ -6183,16 +7393,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err_parse_key_end;
}
- if (timeout > 0) {
- err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
+ if (set->flags & NFT_SET_TIMEOUT) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
if (err < 0)
goto err_parse_key_end;
-
- if (timeout != READ_ONCE(set->timeout)) {
- err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
- if (err < 0)
- goto err_parse_key_end;
- }
}
if (num_exprs) {
@@ -6211,8 +7415,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
+ obj = NULL;
+ goto err_parse_key_end;
+ }
+
+ if (!nft_use_inc(&obj->use)) {
+ err = -EMFILE;
+ obj = NULL;
goto err_parse_key_end;
}
+
err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
if (err < 0)
goto err_parse_key_end;
@@ -6245,7 +7457,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (desc.type == NFT_DATA_VERDICT &&
(elem.data.val.verdict.code == NFT_GOTO ||
elem.data.val.verdict.code == NFT_JUMP))
- nft_validate_state_update(ctx->net,
+ nft_validate_state_update(ctx->table,
NFT_VALIDATE_NEED);
}
@@ -6281,19 +7493,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (flags)
*nft_set_ext_flags(ext) = flags;
+ if (obj)
+ *nft_set_ext_obj(ext) = obj;
+
if (ulen > 0) {
if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
err = -EINVAL;
- goto err_elem_userdata;
+ goto err_elem_free;
}
udata = nft_set_ext_userdata(ext);
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
}
- if (obj) {
- *nft_set_ext_obj(ext) = obj;
- obj->use++;
- }
err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
if (err < 0)
goto err_elem_free;
@@ -6304,11 +7515,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err_elem_free;
}
- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
+ ext->genmask = nft_genmask_cur(ctx->net);
- err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
+ err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
if (err) {
if (err == -EEXIST) {
+ ext2 = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
@@ -6322,8 +7534,40 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
*nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
goto err_element_clash;
- else if (!(nlmsg_flags & NLM_F_EXCL))
+ else if (!(nlmsg_flags & NLM_F_EXCL)) {
err = 0;
+ if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) {
+ struct nft_elem_update update = { };
+
+ if (timeout != nft_set_ext_timeout(ext2)->timeout) {
+ update.timeout = timeout;
+ if (expiration == 0)
+ expiration = timeout;
+
+ update.flags |= NFT_TRANS_UPD_TIMEOUT;
+ }
+ if (expiration) {
+ update.expiration = expiration;
+ update.flags |= NFT_TRANS_UPD_EXPIRATION;
+ }
+
+ if (update.flags) {
+ struct nft_trans_one_elem *ue;
+
+ ue = &nft_trans_container_elem(trans)->elems[0];
+
+ ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL);
+ if (!ue->update) {
+ err = -ENOMEM;
+ goto err_element_clash;
+ }
+
+ ue->priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
+ goto err_elem_free;
+ }
+ }
+ }
} else if (err == -ENOTEMPTY) {
/* ENOTEMPTY reports overlapping between this element
* and an existing one.
@@ -6333,29 +7577,32 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err_element_clash;
}
- if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size &&
- !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
- err = -ENFILE;
- goto err_set_full;
+ if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+ unsigned int max = nft_set_maxsize(set);
+
+ if (!atomic_add_unless(&set->nelems, 1, max)) {
+ err = -ENFILE;
+ goto err_set_full;
+ }
}
- nft_trans_elem(trans) = elem;
- nft_trans_commit_list_add_tail(ctx->net, trans);
+ nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
err_set_full:
- nft_setelem_remove(ctx->net, set, &elem);
+ nft_setelem_remove(ctx->net, set, elem.priv);
err_element_clash:
kfree(trans);
err_elem_free:
- if (obj)
- obj->use--;
-err_elem_userdata:
nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
nft_data_release(&elem.data.val, desc.type);
err_parse_key_end:
+ if (obj)
+ nft_use_dec_restore(&obj->use);
+
nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
@@ -6370,7 +7617,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const nla[])
{
- struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_next(info->net);
u8 family = info->nfmsg->nfgen_family;
@@ -6393,10 +7639,13 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
- if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ if (!list_empty(&set->bindings) &&
+ (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)))
return -EBUSY;
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
@@ -6409,7 +7658,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
}
}
- if (nft_net->validate_state == NFT_VALIDATE_DO)
+ if (table->validate_state == NFT_VALIDATE_DO)
return nft_table_validate(net, table);
return 0;
@@ -6429,50 +7678,99 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
{
struct nft_chain *chain;
- struct nft_rule *rule;
if (type == NFT_DATA_VERDICT) {
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
chain = data->verdict.chain;
- chain->use++;
-
- if (!nft_chain_is_bound(chain))
- break;
-
- chain->table->use++;
- list_for_each_entry(rule, &chain->rules, list)
- chain->use++;
-
- nft_chain_add(chain->table, chain);
+ nft_use_inc_restore(&chain->use);
break;
}
}
}
+static int nft_setelem_active_next(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+ u8 genmask = nft_genmask_next(net);
+
+ return nft_set_elem_active(ext, genmask);
+}
+
static void nft_setelem_data_activate(const struct net *net,
const struct nft_set *set,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_hold(nft_set_ext_data(ext), set->dtype);
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use++;
+ nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}
-static void nft_setelem_data_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use--;
+ nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+}
+
+/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem.
+ * No notifications and no ndeact changes.
+ *
+ * Returns true if set had been added to (i.e., elements need to be removed again).
+ */
+static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx,
+ struct nft_trans_elem *te)
+{
+ bool removed = false;
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ if (te->elems[i].update) {
+ kfree(te->elems[i].update);
+ te->elems[i].update = NULL;
+ /* Update request, so do not release this element */
+ te->elems[i].priv = NULL;
+ continue;
+ }
+
+ if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ atomic_dec(&te->set->nelems);
+
+ removed = true;
+ }
+
+ return removed;
+}
+
+/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */
+static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) {
+ nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv);
+ nft_setelem_activate(ctx->net, te->set, te->elems[i].priv);
+ }
+
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ te->set->ndeact--;
+ }
}
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
@@ -6552,10 +7850,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
goto fail_ops;
- nft_setelem_data_deactivate(ctx->net, set, &elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem.priv);
- nft_trans_elem(trans) = elem;
- nft_trans_commit_list_add_tail(ctx->net, trans);
+ nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
fail_ops:
@@ -6572,48 +7870,44 @@ fail_elem:
static int nft_setelem_flush(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_trans *trans;
- int err;
- trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
- sizeof(struct nft_trans_elem), GFP_ATOMIC);
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+ struct_size_t(struct nft_trans_elem, elems, 1));
if (!trans)
return -ENOMEM;
- if (!set->ops->flush(ctx->net, set, elem->priv)) {
- err = -ENOENT;
- goto err1;
- }
+ set->ops->flush(ctx->net, set, elem_priv);
set->ndeact++;
- nft_setelem_data_deactivate(ctx->net, set, elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
nft_trans_elem_set(trans) = set;
- nft_trans_elem(trans) = *elem;
- nft_trans_commit_list_add_tail(ctx->net, trans);
+ nft_trans_container_elem(trans)->nelems = 1;
+ nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
-err1:
- kfree(trans);
- return err;
}
static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
struct nft_set *set,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
struct nft_trans *trans;
- trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
- sizeof(struct nft_trans_elem), GFP_KERNEL);
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (!trans)
return -ENOMEM;
- nft_setelem_data_deactivate(ctx->net, set, elem);
- nft_trans_elem_set(trans) = set;
- nft_trans_elem(trans) = *elem;
- nft_trans_commit_list_add_tail(ctx->net, trans);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+ nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
}
@@ -6623,20 +7917,19 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
{
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set_elem_catchall *catchall;
- struct nft_set_elem elem;
struct nft_set_ext *ext;
int ret = 0;
- list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_set_elem_active(ext, genmask) ||
- nft_set_elem_mark_busy(ext))
+ if (!nft_set_elem_active(ext, genmask))
continue;
- elem.priv = catchall->elem;
- ret = __nft_set_catchall_flush(ctx, set, &elem);
+ ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
if (ret < 0)
break;
+ nft_set_elem_change_active(ctx->net, set, ext);
}
return ret;
@@ -6646,6 +7939,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
{
struct nft_set_iter iter = {
.genmask = genmask,
+ .type = NFT_ITER_UPDATE,
.fn = nft_setelem_flush,
};
@@ -6677,10 +7971,16 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
return PTR_ERR(table);
}
- set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
- if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ }
+
+ if (nft_set_is_anonymous(set))
+ return -EOPNOTSUPP;
+
+ if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
return -EBUSY;
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
@@ -6690,35 +7990,17 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
+ if (err == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
+ continue;
+
if (err < 0) {
NL_SET_BAD_ATTR(extack, attr);
- break;
+ return err;
}
}
- return err;
-}
-
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
- struct nft_set_gc_batch *gcb;
- unsigned int i;
-
- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
- for (i = 0; i < gcb->head.cnt; i++)
- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
- kfree(gcb);
-}
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp)
-{
- struct nft_set_gc_batch *gcb;
-
- gcb = kzalloc(sizeof(*gcb), gfp);
- if (gcb == NULL)
- return gcb;
- gcb->head.set = set;
- return gcb;
+ return 0;
}
/*
@@ -6889,11 +8171,15 @@ nla_put_failure:
return -1;
}
-static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
{
const struct nft_object_type *type;
- list_for_each_entry(type, &nf_tables_objects, list) {
+ list_for_each_entry_rcu(type, &nf_tables_objects, list) {
+ if (type->family != NFPROTO_UNSPEC &&
+ type->family != family)
+ continue;
+
if (objtype == type->type)
return type;
}
@@ -6901,13 +8187,17 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
}
static const struct nft_object_type *
-nft_obj_type_get(struct net *net, u32 objtype)
+nft_obj_type_get(struct net *net, u32 objtype, u8 family)
{
const struct nft_object_type *type;
- type = __nft_obj_type_get(objtype);
- if (type != NULL && try_module_get(type->owner))
+ rcu_read_lock();
+ type = __nft_obj_type_get(objtype, family);
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -6928,9 +8218,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
struct nft_trans *trans;
int err = -ENOMEM;
- if (!try_module_get(type->owner))
- return -ENOENT;
-
+ /* caller must have obtained type->owner reference. */
trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
sizeof(struct nft_trans_obj));
if (!trans)
@@ -6998,17 +8286,29 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- type = __nft_obj_type_get(objtype);
+ if (!obj->ops->update)
+ return 0;
+
+ type = nft_obj_type_get(net, objtype, family);
+ if (WARN_ON_ONCE(IS_ERR(type)))
+ return PTR_ERR(type);
+
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+ /* type->owner reference is put when transaction object is released. */
return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
}
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- type = nft_obj_type_get(net, objtype);
- if (IS_ERR(type))
- return PTR_ERR(type);
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
+ type = nft_obj_type_get(net, objtype, family);
+ if (IS_ERR(type)) {
+ err = PTR_ERR(type);
+ goto err_type;
+ }
obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
@@ -7025,7 +8325,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
}
if (nla[NFTA_OBJ_USERDATA]) {
- obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL);
+ obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
if (obj->udata == NULL)
goto err_userdata;
@@ -7042,7 +8342,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
goto err_obj_ht;
list_add_tail_rcu(&obj->list, &table->objects);
- table->use++;
+
return 0;
err_obj_ht:
/* queued in transaction log */
@@ -7058,6 +8358,9 @@ err_strdup:
kfree(obj);
err_init:
module_put(type->owner);
+err_type:
+ nft_use_dec_restore(&table->use);
+
return err;
}
@@ -7068,21 +8371,29 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
{
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
- nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
- nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
NFTA_OBJ_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELOBJ ||
+ event == NFT_MSG_DESTROYOBJ) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ goto nla_put_failure;
+
if (obj->udata &&
nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
goto nla_put_failure;
@@ -7095,124 +8406,136 @@ nla_put_failure:
return -1;
}
-struct nft_obj_filter {
+static void audit_log_obj_reset(const struct nft_table *table,
+ unsigned int base_seq, unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+}
+
+struct nft_obj_dump_ctx {
+ unsigned int s_idx;
char *table;
u32 type;
+ bool reset;
};
static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_table *table;
- unsigned int idx = 0, s_idx = cb->args[0];
- struct nft_obj_filter *filter = cb->data;
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nftables_pernet *nft_net;
+ const struct nft_table *table;
+ unsigned int entries = 0;
struct nft_object *obj;
- bool reset = false;
-
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ unsigned int idx = 0;
+ int rc = 0;
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
+ entries = 0;
list_for_each_entry_rcu(obj, &table->objects, list) {
if (!nft_is_active(net, obj))
goto cont;
- if (idx < s_idx)
+ if (idx < ctx->s_idx)
goto cont;
- if (idx > s_idx)
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- if (filter && filter->table &&
- strcmp(filter->table, table->name))
+ if (ctx->table && strcmp(ctx->table, table->name))
goto cont;
- if (filter &&
- filter->type != NFT_OBJECT_UNSPEC &&
- obj->ops->type->type != filter->type)
+ if (ctx->type != NFT_OBJECT_UNSPEC &&
+ obj->ops->type->type != ctx->type)
goto cont;
- if (reset) {
- char *buf = kasprintf(GFP_ATOMIC,
- "%s:%u",
- table->name,
- nft_net->base_seq);
-
- audit_log_nfcfg(buf,
- family,
- obj->handle,
- AUDIT_NFT_OP_OBJ_RESET,
- GFP_ATOMIC);
- kfree(buf);
- }
- if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFT_MSG_NEWOBJ,
- NLM_F_MULTI | NLM_F_APPEND,
- table->family, table,
- obj, reset) < 0)
- goto done;
+ rc = nf_tables_fill_obj_info(skb, net,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWOBJ,
+ NLM_F_MULTI | NLM_F_APPEND,
+ table->family, table,
+ obj, ctx->reset);
+ if (rc < 0)
+ break;
+ entries++;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
idx++;
}
+ if (ctx->reset && entries)
+ audit_log_obj_reset(table, nft_base_seq(net), entries);
+ if (rc < 0)
+ break;
}
-done:
rcu_read_unlock();
- cb->args[0] = idx;
+ ctx->s_idx = idx;
return skb->len;
}
+static int nf_tables_dumpreset_obj(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_obj(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
const struct nlattr * const *nla = cb->data;
- struct nft_obj_filter *filter = NULL;
- if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
- filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
- if (!filter)
- return -ENOMEM;
-
- if (nla[NFTA_OBJ_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
- if (!filter->table) {
- kfree(filter);
- return -ENOMEM;
- }
- }
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (nla[NFTA_OBJ_TYPE])
- filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ if (nla[NFTA_OBJ_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+ if (!ctx->table)
+ return -ENOMEM;
}
- cb->data = filter;
+ if (nla[NFTA_OBJ_TYPE])
+ ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
return 0;
}
+static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
+{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_obj_start(cb);
+}
+
static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
- struct nft_obj_filter *filter = cb->data;
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
- if (filter) {
- kfree(filter->table);
- kfree(filter);
- }
+ kfree(ctx->table);
return 0;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
- const struct nlattr * const nla[])
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_cur(info->net);
@@ -7221,72 +8544,109 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
struct net *net = info->net;
struct nft_object *obj;
struct sk_buff *skb2;
- bool reset = false;
u32 objtype;
int err;
- if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start = nf_tables_dump_obj_start,
- .dump = nf_tables_dump_obj,
- .done = nf_tables_dump_obj_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
- }
-
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
- return PTR_ERR(obj);
+ return ERR_CAST(obj);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
+ err = nf_tables_fill_obj_info(skb2, net, portid,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ family, table, obj, reset);
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ return skb2;
+}
- if (reset) {
- const struct nftables_pernet *nft_net;
- char *buf;
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct sk_buff *skb2;
- nft_net = nft_pernet(net);
- buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq);
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_obj_start,
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
- audit_log_nfcfg(buf,
- family,
- obj->handle,
- AUDIT_NFT_OP_OBJ_RESET,
- GFP_ATOMIC);
- kfree(buf);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
- info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
- family, table, obj, reset);
- if (err < 0)
- goto err_fill_obj_info;
+ skb2 = nf_tables_getobj_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
- return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, info->net, portid);
+}
-err_fill_obj_info:
- kfree_skb(skb2);
- return err;
+static int nf_tables_getobj_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dumpreset_obj_start,
+ .dump = nf_tables_dumpreset_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getobj_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_OBJ_TABLE]),
+ (char *)nla_data(nla[NFTA_OBJ_TABLE]),
+ nft_base_seq(net));
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
}
static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
@@ -7334,6 +8694,10 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
}
if (IS_ERR(obj)) {
+ if (PTR_ERR(obj) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(obj);
}
@@ -7347,24 +8711,14 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
return nft_delobj(&ctx, obj);
}
-void nft_obj_notify(struct net *net, const struct nft_table *table,
- struct nft_object *obj, u32 portid, u32 seq, int event,
- u16 flags, int family, int report, gfp_t gfp)
+static void
+__nft_obj_notify(struct net *net, const struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ u16 flags, int family, int report, gfp_t gfp)
{
struct nftables_pernet *nft_net = nft_pernet(net);
struct sk_buff *skb;
int err;
- char *buf = kasprintf(gfp, "%s:%u",
- table->name, nft_net->base_seq);
-
- audit_log_nfcfg(buf,
- family,
- obj->handle,
- event == NFT_MSG_NEWOBJ ?
- AUDIT_NFT_OP_OBJ_REGISTER :
- AUDIT_NFT_OP_OBJ_UNREGISTER,
- gfp);
- kfree(buf);
if (!report &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -7387,13 +8741,34 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
+
+void nft_obj_notify(struct net *net, const struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ u16 flags, int family, int report, gfp_t gfp)
+{
+ char *buf = kasprintf(gfp, "%s:%u",
+ table->name, nft_base_seq(net));
+
+ audit_log_nfcfg(buf,
+ family,
+ obj->handle,
+ event == NFT_MSG_NEWOBJ ?
+ AUDIT_NFT_OP_OBJ_REGISTER :
+ AUDIT_NFT_OP_OBJ_UNREGISTER,
+ gfp);
+ kfree(buf);
+
+ __nft_obj_notify(net, table, obj, portid, seq, event,
+ flags, family, report, gfp);
+}
EXPORT_SYMBOL_GPL(nft_obj_notify);
static void nf_tables_obj_notify(const struct nft_ctx *ctx,
struct nft_object *obj, int event)
{
- nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
- ctx->flags, ctx->family, ctx->report, GFP_KERNEL);
+ __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+ ctx->seq, event, ctx->flags, ctx->family,
+ ctx->report, GFP_KERNEL);
}
/*
@@ -7425,12 +8800,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
[NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 },
};
-struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
- list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+ list_for_each_entry_rcu(flowtable, &table->flowtables, list,
+ lockdep_commit_lock_is_held(net)) {
if (!nla_strcmp(nla, flowtable->name) &&
nft_active_genmask(flowtable, genmask))
return flowtable;
@@ -7444,10 +8821,11 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
enum nft_trans_phase phase)
{
switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
case NFT_TRANS_PREPARE:
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
- flowtable->use--;
+ nft_use_dec(&flowtable->use);
fallthrough;
default:
return;
@@ -7482,26 +8860,31 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX
};
static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
- const struct nlattr *attr,
+ const struct nlattr * const nla[],
struct nft_flowtable_hook *flowtable_hook,
- struct nft_flowtable *flowtable, bool add)
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack, bool add)
{
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int hooknum, priority;
int err;
INIT_LIST_HEAD(&flowtable_hook->list);
- err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
+ err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX,
+ nla[NFTA_FLOWTABLE_HOOK],
nft_flowtable_hook_policy, NULL);
if (err < 0)
return err;
if (add) {
if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
- !tb[NFTA_FLOWTABLE_HOOK_PRIORITY])
- return -EINVAL;
+ !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
+ return -ENOENT;
+ }
hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
if (hooknum != NF_NETDEV_INGRESS)
@@ -7531,27 +8914,32 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) {
err = nf_tables_parse_netdev_hooks(ctx->net,
tb[NFTA_FLOWTABLE_HOOK_DEVS],
- &flowtable_hook->list);
+ &flowtable_hook->list,
+ extack);
if (err < 0)
return err;
}
list_for_each_entry(hook, &flowtable_hook->list, list) {
- hook->ops.pf = NFPROTO_NETDEV;
- hook->ops.hooknum = flowtable_hook->num;
- hook->ops.priority = flowtable_hook->priority;
- hook->ops.priv = &flowtable->data;
- hook->ops.hook = flowtable->data.type->hook;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable_hook->num;
+ ops->priority = flowtable_hook->priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+ }
}
return err;
}
+/* call under rcu_read_lock */
static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
{
const struct nf_flowtable_type *type;
- list_for_each_entry(type, &nf_tables_flowtables, list) {
+ list_for_each_entry_rcu(type, &nf_tables_flowtables, list) {
if (family == type->family)
return type;
}
@@ -7563,9 +8951,13 @@ nft_flowtable_type_get(struct net *net, u8 family)
{
const struct nf_flowtable_type *type;
+ rcu_read_lock();
type = __nft_flowtable_type_get(family);
- if (type != NULL && try_module_get(type->owner))
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -7578,34 +8970,58 @@ nft_flowtable_type_get(struct net *net, u8 family)
}
/* Only called from error and netdev event paths. */
-static void nft_unregister_flowtable_hook(struct net *net,
- struct nft_flowtable *flowtable,
- struct nft_hook *hook)
+static void nft_unregister_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
{
- nf_unregister_net_hook(net, &hook->ops);
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+ nf_unregister_net_hook(net, ops);
+ flowtable->data.type->setup(&flowtable->data, ops->dev,
FLOW_BLOCK_UNBIND);
}
static void __nft_unregister_flowtable_net_hooks(struct net *net,
+ struct nft_flowtable *flowtable,
struct list_head *hook_list,
bool release_netdev)
{
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- nf_unregister_net_hook(net, &hook->ops);
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(net, flowtable, ops);
if (release_netdev) {
list_del(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
}
static void nft_unregister_flowtable_net_hooks(struct net *net,
+ struct nft_flowtable *flowtable,
struct list_head *hook_list)
{
- __nft_unregister_flowtable_net_hooks(net, hook_list, false);
+ __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
+}
+
+static int nft_register_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
+{
+ int err;
+
+ err = flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_BIND);
+ if (err < 0)
+ return err;
+
+ err = nf_register_net_hook(net, ops);
+ if (!err)
+ return 0;
+
+ flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_UNBIND);
+ return err;
}
static int nft_register_flowtable_net_hooks(struct net *net,
@@ -7613,8 +9029,9 @@ static int nft_register_flowtable_net_hooks(struct net *net,
struct list_head *hook_list,
struct nft_flowtable *flowtable)
{
- struct nft_hook *hook, *hook2, *next;
+ struct nft_hook *hook, *next;
struct nft_flowtable *ft;
+ struct nf_hook_ops *ops;
int err, i = 0;
list_for_each_entry(hook, hook_list, list) {
@@ -7622,77 +9039,85 @@ static int nft_register_flowtable_net_hooks(struct net *net,
if (!nft_is_active_next(net, ft))
continue;
- list_for_each_entry(hook2, &ft->hook_list, list) {
- if (hook->ops.dev == hook2->ops.dev &&
- hook->ops.pf == hook2->ops.pf) {
- err = -EEXIST;
- goto err_unregister_net_hooks;
- }
+ if (nft_hook_list_find(&ft->hook_list, hook)) {
+ err = -EEXIST;
+ goto err_unregister_net_hooks;
}
}
- err = flowtable->data.type->setup(&flowtable->data,
- hook->ops.dev,
- FLOW_BLOCK_BIND);
- if (err < 0)
- goto err_unregister_net_hooks;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nft_register_flowtable_ops(net, flowtable, ops);
+ if (err < 0)
+ goto err_unregister_net_hooks;
- err = nf_register_net_hook(net, &hook->ops);
- if (err < 0) {
- flowtable->data.type->setup(&flowtable->data,
- hook->ops.dev,
- FLOW_BLOCK_UNBIND);
- goto err_unregister_net_hooks;
+ i++;
}
-
- i++;
}
return 0;
err_unregister_net_hooks:
list_for_each_entry_safe(hook, next, hook_list, list) {
- if (i-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
- nft_unregister_flowtable_hook(net, flowtable, hook);
+ nft_unregister_flowtable_ops(net, flowtable, ops);
+ }
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
return err;
}
-static void nft_flowtable_hooks_destroy(struct list_head *hook_list)
+static void nft_hooks_destroy(struct list_head *hook_list)
{
struct nft_hook *hook, *next;
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
+ struct nftables_pernet *nft_net;
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
struct nft_trans *trans;
bool unregister = false;
u32 flags;
int err;
- err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
- &flowtable_hook, flowtable, false);
+ err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+ extack, false);
if (err < 0)
return err;
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
if (nft_hook_list_find(&flowtable->hook_list, hook)) {
list_del(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free(hook);
+ continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWFLOWTABLE ||
+ trans->table != ctx->table ||
+ !nft_trans_flowtable_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) {
+ err = -EEXIST;
+ goto err_flowtable_update_hook;
+ }
}
}
@@ -7736,10 +9161,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
err_flowtable_update_hook:
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
- if (unregister)
- nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
+ if (unregister) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(ctx->net,
+ flowtable, ops);
+ }
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
return err;
@@ -7756,9 +9184,9 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
u8 family = info->nfmsg->nfgen_family;
const struct nf_flowtable_type *type;
struct nft_flowtable *flowtable;
- struct nft_hook *hook, *next;
struct net *net = info->net;
struct nft_table *table;
+ struct nft_trans *trans;
struct nft_ctx ctx;
int err;
@@ -7774,7 +9202,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
return PTR_ERR(table);
}
- flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
genmask);
if (IS_ERR(flowtable)) {
err = PTR_ERR(flowtable);
@@ -7790,14 +9218,19 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- return nft_flowtable_update(&ctx, info->nlh, flowtable);
+ return nft_flowtable_update(&ctx, info->nlh, flowtable, extack);
}
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
- if (!flowtable)
- return -ENOMEM;
+ if (!flowtable) {
+ err = -ENOMEM;
+ goto flowtable_alloc;
+ }
flowtable->table = table;
flowtable->handle = nf_tables_alloc_handle(table);
@@ -7830,38 +9263,37 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
if (err < 0)
goto err3;
- err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
- &flowtable_hook, flowtable, true);
+ err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable,
+ extack, true);
if (err < 0)
- goto err4;
+ goto err_flowtable_parse_hooks;
list_splice(&flowtable_hook.list, &flowtable->hook_list);
flowtable->data.priority = flowtable_hook.priority;
flowtable->hooknum = flowtable_hook.num;
+ trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto err_flowtable_trans;
+ }
+
+ /* This must be LAST to ensure no packets are walking over this flowtable. */
err = nft_register_flowtable_net_hooks(ctx.net, table,
&flowtable->hook_list,
flowtable);
- if (err < 0) {
- nft_flowtable_hooks_destroy(&flowtable->hook_list);
- goto err4;
- }
-
- err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
- goto err5;
+ goto err_flowtable_hooks;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
- table->use++;
return 0;
-err5:
- list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- nft_unregister_flowtable_hook(net, flowtable, hook);
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
- }
-err4:
+
+err_flowtable_hooks:
+ nft_trans_destroy(trans);
+err_flowtable_trans:
+ nft_hooks_destroy(&flowtable->hook_list);
+err_flowtable_parse_hooks:
flowtable->data.type->free(&flowtable->data);
err3:
module_put(type->owner);
@@ -7869,6 +9301,9 @@ err2:
kfree(flowtable->name);
err1:
kfree(flowtable);
+flowtable_alloc:
+ nft_use_dec_restore(&table->use);
+
return err;
}
@@ -7878,12 +9313,13 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook
list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
list_del(&this->list);
- kfree(this);
+ nft_netdev_hook_free(this);
}
}
static int nft_delflowtable_hook(struct nft_ctx *ctx,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
@@ -7892,8 +9328,8 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
struct nft_trans *trans;
int err;
- err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
- &flowtable_hook, flowtable, false);
+ err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+ extack, false);
if (err < 0)
return err;
@@ -7960,10 +9396,14 @@ static int nf_tables_delflowtable(struct sk_buff *skb,
flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_FLOWTABLE_NAME];
- flowtable = nft_flowtable_lookup(table, attr, genmask);
+ flowtable = nft_flowtable_lookup(net, table, attr, genmask);
}
if (IS_ERR(flowtable)) {
+ if (PTR_ERR(flowtable) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(flowtable);
}
@@ -7971,7 +9411,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
if (nla[NFTA_FLOWTABLE_HOOK])
- return nft_delflowtable_hook(&ctx, flowtable);
+ return nft_delflowtable_hook(&ctx, flowtable, extack);
if (flowtable->use > 0) {
NL_SET_BAD_ATTR(extack, attr);
@@ -7991,17 +9431,26 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
struct nft_hook *hook;
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
- nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
- NFTA_FLOWTABLE_PAD) ||
+ NFTA_FLOWTABLE_PAD))
+ goto nla_put_failure;
+
+ if (!hook_list &&
+ (event == NFT_MSG_DELFLOWTABLE ||
+ event == NFT_MSG_DESTROYFLOWTABLE)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
goto nla_put_failure;
@@ -8016,8 +9465,12 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
if (!nest_devs)
goto nla_put_failure;
- list_for_each_entry_rcu(hook, hook_list, list) {
- if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
+ if (!hook_list)
+ hook_list = &flowtable->hook_list;
+
+ list_for_each_entry_rcu(hook, hook_list, list,
+ lockdep_commit_lock_is_held(net)) {
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -8049,7 +9502,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -8072,8 +9525,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
NFT_MSG_NEWFLOWTABLE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- flowtable,
- &flowtable->hook_list) < 0)
+ flowtable, NULL) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -8128,6 +9580,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const nla[])
{
+ struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_cur(info->net);
u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
@@ -8153,13 +9606,17 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
genmask, 0);
- if (IS_ERR(table))
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
genmask);
- if (IS_ERR(flowtable))
+ if (IS_ERR(flowtable)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return PTR_ERR(flowtable);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
@@ -8168,7 +9625,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
info->nlh->nlmsg_seq,
NFT_MSG_NEWFLOWTABLE, 0, family,
- flowtable, &flowtable->hook_list);
+ flowtable, NULL);
if (err < 0)
goto err_fill_flowtable_info;
@@ -8181,8 +9638,7 @@ err_fill_flowtable_info:
static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
struct nft_flowtable *flowtable,
- struct list_head *hook_list,
- int event)
+ struct list_head *hook_list, int event)
{
struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct sk_buff *skb;
@@ -8220,10 +9676,8 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
flowtable->data.type->free(&flowtable->data);
list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
- FLOW_BLOCK_UNBIND);
list_del_rcu(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free_rcu(hook);
}
kfree(flowtable->name);
module_put(flowtable->data.type->owner);
@@ -8233,17 +9687,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
struct nlmsghdr *nlh;
char buf[TASK_COMM_LEN];
int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
- NFNETLINK_V0, nft_base_seq(net));
+ NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) ||
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
goto nla_put_failure;
@@ -8256,46 +9709,132 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void nft_flowtable_event(unsigned long event, struct net_device *dev,
- struct nft_flowtable *flowtable)
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+ const struct net_device *dev)
+{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops);
+
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+ const struct net_device *dev)
{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_rcu(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
+
+static int nft_flowtable_event(unsigned long event, struct net_device *dev,
+ struct nft_flowtable *flowtable, bool changename)
+{
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
+ bool match;
list_for_each_entry(hook, &flowtable->hook_list, list) {
- if (hook->ops.dev != dev)
- continue;
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
- /* flow_offload_netdev_event() cleans up entries for us. */
- nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ /* flow_offload_netdev_event() cleans up entries for us. */
+ nft_unregister_flowtable_ops(dev_net(dev),
+ flowtable, ops);
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kzalloc(sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable->hooknum;
+ ops->priority = flowtable->data.priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+ ops->dev = dev;
+ if (nft_register_flowtable_ops(dev_net(dev),
+ flowtable, ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
break;
}
+ return 0;
+}
+
+static int __nf_tables_flowtable_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
+{
+ struct nftables_pernet *nft_net = nft_pernet(dev_net(dev));
+ struct nft_flowtable *flowtable;
+ struct nft_table *table;
+
+ list_for_each_entry(table, &nft_net->tables, list) {
+ list_for_each_entry(flowtable, &table->flowtables, list) {
+ if (nft_flowtable_event(event, dev,
+ flowtable, changename))
+ return 1;
+ }
+ }
+ return 0;
}
static int nf_tables_flowtable_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct nft_flowtable *flowtable;
struct nftables_pernet *nft_net;
- struct nft_table *table;
+ int ret = NOTIFY_DONE;
struct net *net;
- if (event != NETDEV_UNREGISTER)
- return 0;
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
net = dev_net(dev);
nft_net = nft_pernet(net);
mutex_lock(&nft_net->commit_mutex);
- list_for_each_entry(table, &nft_net->tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- nft_flowtable_event(event, dev, flowtable);
+
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
}
+ __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_flowtable_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
}
+out_unlock:
mutex_unlock(&nft_net->commit_mutex);
-
- return NOTIFY_DONE;
+ return ret;
}
static struct notifier_block nf_tables_flowtable_notifier = {
@@ -8373,6 +9912,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
+ [NFT_MSG_DESTROYTABLE] = {
+ .call = nf_tables_deltable,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
[NFT_MSG_NEWCHAIN] = {
.call = nf_tables_newchain,
.type = NFNL_CB_BATCH,
@@ -8391,6 +9936,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
+ [NFT_MSG_DESTROYCHAIN] = {
+ .call = nf_tables_delchain,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
[NFT_MSG_NEWRULE] = {
.call = nf_tables_newrule,
.type = NFNL_CB_BATCH,
@@ -8404,7 +9955,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE_RESET] = {
- .call = nf_tables_getrule,
+ .call = nf_tables_getrule_reset,
.type = NFNL_CB_RCU,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
@@ -8415,6 +9966,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
+ [NFT_MSG_DESTROYRULE] = {
+ .call = nf_tables_delrule,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
[NFT_MSG_NEWSET] = {
.call = nf_tables_newset,
.type = NFNL_CB_BATCH,
@@ -8433,6 +9990,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
+ [NFT_MSG_DESTROYSET] = {
+ .call = nf_tables_delset,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
[NFT_MSG_NEWSETELEM] = {
.call = nf_tables_newsetelem,
.type = NFNL_CB_BATCH,
@@ -8445,12 +10008,24 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
+ [NFT_MSG_GETSETELEM_RESET] = {
+ .call = nf_tables_getsetelem_reset,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
[NFT_MSG_DELSETELEM] = {
.call = nf_tables_delsetelem,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
+ [NFT_MSG_DESTROYSETELEM] = {
+ .call = nf_tables_delsetelem,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
[NFT_MSG_GETGEN] = {
.call = nf_tables_getgen,
.type = NFNL_CB_RCU,
@@ -8473,8 +10048,14 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
+ [NFT_MSG_DESTROYOBJ] = {
+ .call = nf_tables_delobj,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
[NFT_MSG_GETOBJ_RESET] = {
- .call = nf_tables_getobj,
+ .call = nf_tables_getobj_reset,
.type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
@@ -8497,6 +10078,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
+ [NFT_MSG_DESTROYFLOWTABLE] = {
+ .call = nf_tables_delflowtable,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_FLOWTABLE_MAX,
+ .policy = nft_flowtable_policy,
+ },
};
static int nf_tables_validate(struct net *net)
@@ -8504,18 +10091,20 @@ static int nf_tables_validate(struct net *net)
struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_table *table;
- switch (nft_net->validate_state) {
- case NFT_VALIDATE_SKIP:
- break;
- case NFT_VALIDATE_NEED:
- nft_validate_state_update(net, NFT_VALIDATE_DO);
- fallthrough;
- case NFT_VALIDATE_DO:
- list_for_each_entry(table, &nft_net->tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
+ switch (table->validate_state) {
+ case NFT_VALIDATE_SKIP:
+ continue;
+ case NFT_VALIDATE_NEED:
+ nft_validate_state_update(table, NFT_VALIDATE_DO);
+ fallthrough;
+ case NFT_VALIDATE_DO:
if (nft_table_validate(net, table) < 0)
return -EAGAIN;
+
+ nft_validate_state_update(table, NFT_VALIDATE_SKIP);
+ break;
}
- break;
}
return 0;
@@ -8528,51 +10117,53 @@ static int nf_tables_validate(struct net *net)
*
* We defer the drop policy until the transaction has been finalized.
*/
-static void nft_chain_commit_drop_policy(struct nft_trans *trans)
+static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans) /* set policy NF_DROP on the transaction's chain, but only if it is a base chain */
{
struct nft_base_chain *basechain;
- if (nft_trans_chain_policy(trans) != NF_DROP)
+ if (trans->policy != NF_DROP) /* the new signature reads the policy directly from the chain transaction */
return;
- if (!nft_is_base_chain(trans->ctx.chain))
+ if (!nft_is_base_chain(trans->chain)) /* the policy field is only reached through nft_base_chain() below */
return;
- basechain = nft_base_chain(trans->ctx.chain);
+ basechain = nft_base_chain(trans->chain);
basechain->policy = NF_DROP;
}
-static void nft_chain_commit_update(struct nft_trans *trans)
+static void nft_chain_commit_update(struct nft_trans_chain *trans) /* apply a chain update: optional rename, then stats and policy for base chains */
{
+ struct nft_table *table = trans->nft_trans_binding.nft_trans.table; /* table is reached through the embedded generic transaction */
struct nft_base_chain *basechain;
- if (nft_trans_chain_name(trans)) {
- rhltable_remove(&trans->ctx.table->chains_ht,
- &trans->ctx.chain->rhlhead,
+ if (trans->name) { /* rename requested: unhash, swap names, rehash under the new name */
+ rhltable_remove(&table->chains_ht,
+ &trans->chain->rhlhead,
nft_chain_ht_params);
- swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
- rhltable_insert_key(&trans->ctx.table->chains_ht,
- trans->ctx.chain->name,
- &trans->ctx.chain->rhlhead,
+ swap(trans->chain->name, trans->name); /* after the swap trans->name holds the previous chain name */
+ rhltable_insert_key(&table->chains_ht,
+ trans->chain->name,
+ &trans->chain->rhlhead,
nft_chain_ht_params);
}
- if (!nft_is_base_chain(trans->ctx.chain))
+ if (!nft_is_base_chain(trans->chain)) /* everything below applies to base chains only */
return;
nft_chain_stats_replace(trans);
- basechain = nft_base_chain(trans->ctx.chain);
+ basechain = nft_base_chain(trans->chain);
- switch (nft_trans_chain_policy(trans)) {
+ switch (trans->policy) { /* any value other than NF_DROP/NF_ACCEPT leaves the policy unchanged */
case NF_DROP:
case NF_ACCEPT:
- basechain->policy = nft_trans_chain_policy(trans);
+ basechain->policy = trans->policy;
break;
}
}
-static void nft_obj_commit_update(struct nft_trans *trans)
+static void nft_obj_commit_update(const struct nft_ctx *ctx,
+ struct nft_trans *trans)
{
struct nft_object *newobj;
struct nft_object *obj;
@@ -8580,60 +10171,76 @@ static void nft_obj_commit_update(struct nft_trans *trans)
obj = nft_trans_obj(trans);
newobj = nft_trans_obj_newobj(trans);
- if (obj->ops->update)
- obj->ops->update(obj, newobj);
+ if (WARN_ON_ONCE(!obj->ops->update))
+ return;
- nft_obj_destroy(&trans->ctx, newobj);
+ obj->ops->update(obj, newobj);
+ nft_obj_destroy(ctx, newobj);
}
static void nft_commit_release(struct nft_trans *trans) /* release whatever a committed transaction still owns, selected by msg_type, then free the transaction */
{
+ struct nft_ctx ctx = {
+ .net = trans->net,
+ };
+
+ nft_ctx_update(&ctx, trans); /* NOTE(review): presumably fills the remaining ctx fields from trans - confirm in nft_ctx_update() */
+
switch (trans->msg_type) {
case NFT_MSG_DELTABLE:
- nf_tables_table_destroy(&trans->ctx);
+ case NFT_MSG_DESTROYTABLE:
+ nf_tables_table_destroy(trans->table);
break;
case NFT_MSG_NEWCHAIN:
free_percpu(nft_trans_chain_stats(trans)); /* free the per-cpu stats and the name still attached to the transaction */
kfree(nft_trans_chain_name(trans));
break;
case NFT_MSG_DELCHAIN:
- nf_tables_chain_destroy(&trans->ctx);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) /* an update transaction only carries a hook list; otherwise the whole chain is destroyed */
+ nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+ else
+ nf_tables_chain_destroy(nft_trans_chain(trans));
break;
case NFT_MSG_DELRULE:
- nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ case NFT_MSG_DESTROYRULE:
+ nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
- nft_set_destroy(&trans->ctx, nft_trans_set(trans));
+ case NFT_MSG_DESTROYSET:
+ nft_set_destroy(&ctx, nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nf_tables_set_elem_destroy(&trans->ctx,
- nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv);
+ case NFT_MSG_DESTROYSETELEM:
+ nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); /* replaces the removed single-element nf_tables_set_elem_destroy() call */
break;
case NFT_MSG_DELOBJ:
- nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+ case NFT_MSG_DESTROYOBJ:
+ nft_obj_destroy(&ctx, nft_trans_obj(trans));
break;
case NFT_MSG_DELFLOWTABLE:
+ case NFT_MSG_DESTROYFLOWTABLE:
if (nft_trans_flowtable_update(trans)) /* same split as for chains: hook list only on update, full destroy otherwise */
- nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
else
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
if (trans->put_net) /* drop the netns reference only for a transaction flagged as holding one */
- put_net(trans->ctx.net);
+ put_net(trans->net);
kfree(trans);
}
static void nf_tables_trans_destroy_work(struct work_struct *w)
{
+ struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
struct nft_trans *trans, *next;
LIST_HEAD(head);
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_init(&nf_tables_destroy_list, &head);
+ list_splice_init(&nft_net->destroy_list, &head);
spin_unlock(&nf_tables_destroy_list_lock);
if (list_empty(&head))
@@ -8642,14 +10249,16 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
synchronize_rcu();
list_for_each_entry_safe(trans, next, &head, list) {
- list_del(&trans->list);
+ nft_trans_list_del(trans);
nft_commit_release(trans);
}
}
-void nf_tables_trans_destroy_flush_work(void)
+void nf_tables_trans_destroy_flush_work(struct net *net) /* flush the destroy work item belonging to @net's nftables per-netns state */
{
- flush_work(&trans_destroy_work);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ flush_work(&nft_net->destroy_work); /* the work item is now per netns instead of the removed global trans_destroy_work */
}
EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
@@ -8680,9 +10289,8 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
return -ENOMEM;
}
}
- data_size += offsetof(struct nft_rule_dp, data); /* last rule */
- chain->blob_next = nf_tables_chain_alloc_rules(data_size);
+ chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size);
if (!chain->blob_next)
return -ENOMEM;
@@ -8709,7 +10317,7 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
continue;
}
- if (WARN_ON_ONCE(data + expr->ops->size > data_boundary))
+ if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
return -ENOMEM;
memcpy(data + size, expr, expr->ops->size);
@@ -8727,12 +10335,11 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
chain->blob_next->size += (unsigned long)(data - (void *)prule);
}
- prule = (struct nft_rule_dp *)data;
- data += offsetof(struct nft_rule_dp, data);
if (WARN_ON_ONCE(data > data_boundary))
return -ENOMEM;
- nft_last_rule(chain->blob_next, prule);
+ prule = (struct nft_rule_dp *)data;
+ nft_last_rule(chain, prule);
return 0;
}
@@ -8743,32 +10350,32 @@ static void nf_tables_commit_chain_prepare_cancel(struct net *net)
struct nft_trans *trans, *next;
list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
- struct nft_chain *chain = trans->ctx.chain;
-
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
+ struct nft_chain *chain = nft_trans_rule_chain(trans);
+
kvfree(chain->blob_next);
chain->blob_next = NULL;
}
}
}
-static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+static void __nf_tables_commit_chain_free_rules(struct rcu_head *h) /* rcu_head callback: kvfree() the blob recorded in the trailer that embeds @h */
{
- struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+ struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h); /* recover the trailer from its embedded rcu_head */
- kvfree(o->blob);
+ kvfree(l->blob);
}
static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob) /* hand @blob to call_rcu() so it is freed by the callback rather than immediately */
{
- struct nft_rules_old *old;
+ struct nft_rule_dp_last *last;
- /* rcu_head is after end marker */
- old = (void *)blob + sizeof(*blob) + blob->size;
- old->blob = blob;
+ /* last rule trailer is after end marker */
+ last = (void *)blob + sizeof(*blob) + blob->size; /* trailer address = blob header plus blob->size bytes of rule data */
+ last->blob = blob; /* back-pointer the callback passes to kvfree() */
- call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+ call_rcu(&last->h, __nf_tables_commit_chain_free_rules);
}
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
@@ -8834,6 +10441,243 @@ void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+ struct nft_trans_gc *trans) /* deactivate, then remove from the set, every element collected in this gc batch */
+{
+ struct nft_elem_priv **priv = trans->priv;
+ unsigned int i;
+
+ for (i = 0; i < trans->count; i++) { /* only the first trans->count slots of priv[] are in use */
+ nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]);
+ nft_setelem_remove(ctx->net, trans->set, priv[i]);
+ }
+}
+
+void nft_trans_gc_destroy(struct nft_trans_gc *trans) /* free a gc batch after putting its set and netns references; collected elements are not touched here */
+{
+ nft_set_put(trans->set);
+ put_net(trans->net);
+ kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu) /* rcu_head callback: destroy every element in the batch, then the batch itself */
+{
+ struct nft_elem_priv *elem_priv;
+ struct nft_trans_gc *trans;
+ struct nft_ctx ctx = {};
+ unsigned int i;
+
+ trans = container_of(rcu, struct nft_trans_gc, rcu); /* recover the batch from its embedded rcu_head */
+ ctx.net = read_pnet(&trans->set->net); /* only ctx.net is filled in; every other ctx field stays zero */
+
+ for (i = 0; i < trans->count; i++) {
+ elem_priv = trans->priv[i];
+ if (!nft_setelem_is_catchall(trans->set, elem_priv)) /* set->nelems is decremented for non-catchall elements only */
+ atomic_dec(&trans->set->nelems);
+
+ nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
+ }
+
+ nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) /* under commit_mutex, remove the batch's elements from the set; returns false if the batch is stale and was skipped */
+{
+ struct nftables_pernet *nft_net;
+ struct nft_ctx ctx = {};
+
+ nft_net = nft_pernet(trans->net);
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ /* Check for race with transaction, otherwise this batch refers to
+ * stale objects that might not be there anymore. Skip transaction if
+ * set has been destroyed from control plane transaction in case gc
+ * worker loses race.
+ */
+ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { /* stale when gc_seq moved since the batch was allocated, or the set is marked dead */
+ mutex_unlock(&nft_net->commit_mutex);
+ return false; /* nothing was removed; the caller decides what to do with the batch */
+ }
+
+ ctx.net = trans->net;
+ ctx.table = trans->set->table;
+
+ nft_trans_gc_setelem_remove(&ctx, trans);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work) /* work handler: drain nf_tables_gc_list and process each queued gc batch */
+{
+ struct nft_trans_gc *trans, *next;
+ LIST_HEAD(trans_gc_list);
+
+ spin_lock(&nf_tables_gc_list_lock);
+ list_splice_init(&nf_tables_gc_list, &trans_gc_list); /* take the whole pending list so the spinlock is not held while batches are processed */
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+ list_del(&trans->list);
+ if (!nft_trans_gc_work_done(trans)) { /* stale batch: free the batch only, its elements are left alone */
+ nft_trans_gc_destroy(trans);
+ continue;
+ }
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free); /* elements were removed from the set; destroy them from the rcu callback */
+ }
+}
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp) /* allocate an empty gc batch for @set holding a netns and a set reference; returns NULL on failure */
+{
+ struct net *net = read_pnet(&set->net);
+ struct nft_trans_gc *trans;
+
+ trans = kzalloc(sizeof(*trans), gfp); /* zeroed, so trans->count starts at 0 */
+ if (!trans)
+ return NULL;
+
+ trans->net = maybe_get_net(net); /* a NULL result is treated as failure */
+ if (!trans->net) {
+ kfree(trans);
+ return NULL;
+ }
+
+ refcount_inc(&set->refs); /* paired with nft_set_put() in nft_trans_gc_destroy() */
+ trans->set = set;
+ trans->seq = gc_seq; /* compared against nft_net->gc_seq by the async worker to detect a stale batch */
+
+ return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) /* append @priv to the batch; NOTE(review): no capacity check here - the catchall callers in this file call nft_trans_gc_queue_async()/_sync() first, confirm for other callers */
+{
+ trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) /* append the batch to nf_tables_gc_list and schedule trans_gc_work */
+{
+ spin_lock(&nf_tables_gc_list_lock);
+ list_add_tail(&trans->list, &nf_tables_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans) /* number of free slots left in the batch; 0 means full */
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp) /* return a batch with room for one more element: @gc itself, or a new batch after queueing the full one (NULL if that allocation fails) */
+{
+ struct nft_set *set;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ set = gc->set; /* read the set before @gc is handed over to the worker */
+ nft_trans_gc_queue_work(gc);
+
+ return nft_trans_gc_alloc(set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) /* finish an async gc run: queue the last batch to the worker, or free it if it is empty */
+{
+ if (trans->count == 0) { /* nothing collected, so there is no work to queue */
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) /* sync counterpart of nft_trans_gc_queue_async(): a full batch goes straight to call_rcu() instead of the worker */
+{
+ struct nft_set *set;
+
+ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) /* the sync path warns and returns NULL unless lockdep sees the commit lock held */
+ return NULL;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ set = gc->set; /* read the set before @gc is handed to call_rcu() */
+ call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+ return nft_trans_gc_alloc(set, 0, gfp); /* seq 0: this path never goes through the gc_seq check in nft_trans_gc_work_done() */
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) /* finish a sync gc run: pass the last batch to call_rcu(), or free it if it is empty */
+{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); /* warns only; unlike nft_trans_gc_queue_sync() it still proceeds */
+
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq) /* collect expired catchall elements of gc->set into the batch; returns the current batch, or NULL if a new batch could not be allocated */
+{
+ struct nft_set_elem_catchall *catchall;
+ const struct nft_set *set = gc->set;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) { /* NOTE(review): RCU list walk - presumably callers hold rcu_read_lock(), confirm */
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!nft_set_elem_expired(ext))
+ continue;
+ if (nft_set_elem_is_dead(ext)) /* already marked dead: skip the marking but still collect it */
+ goto dead_elem;
+
+ nft_set_elem_dead(ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); /* may swap in a fresh batch if the current one is full */
+ if (!gc)
+ return NULL;
+
+ nft_trans_gc_elem_add(gc, catchall->elem);
+ }
+
+ return gc;
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) /* sync variant: deactivate expired catchall elements and destroy their list entries right away, collecting the elements into the batch */
+{
+ struct nft_set_elem_catchall *catchall, *next;
+ u64 tstamp = nft_net_tstamp(gc->net); /* one timestamp is used for every expiry check in this walk */
+ const struct nft_set *set = gc->set;
+ struct nft_elem_priv *elem_priv;
+ struct nft_set_ext *ext;
+
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { /* _safe iteration: the current entry is destroyed inside the loop */
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!__nft_set_elem_expired(ext, tstamp))
+ continue;
+
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); /* may swap in a fresh batch; NULL aborts the walk */
+ if (!gc)
+ return NULL;
+
+ elem_priv = catchall->elem; /* keep the element pointer before the catchall entry is destroyed */
+ nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
+ nft_setelem_catchall_destroy(catchall);
+ nft_trans_gc_elem_add(gc, elem_priv);
+ }
+
+ return gc;
+}
+
static void nf_tables_module_autoload_cleanup(struct net *net)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -8867,16 +10711,16 @@ static void nf_tables_commit_release(struct net *net)
trans = list_last_entry(&nft_net->commit_list,
struct nft_trans, list);
- get_net(trans->ctx.net);
+ get_net(trans->net);
WARN_ON_ONCE(trans->put_net);
trans->put_net = true;
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
+ list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
nf_tables_module_autoload_cleanup(net);
- schedule_work(&trans_destroy_work);
+ schedule_work(&nft_net->destroy_work);
mutex_unlock(&nft_net->commit_mutex);
}
@@ -8944,9 +10788,24 @@ static void nf_tables_commit_audit_free(struct list_head *adl)
}
}
+/* nft audit emits the number of elements that get added/removed/updated,
+ * so NEW/DELSETELEM needs to increment based on the total elem count.
+ */
+static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans) /* returns how many audit entries @trans contributes */
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSETELEM:
+ case NFT_MSG_DELSETELEM: /* NOTE(review): NFT_MSG_DESTROYSETELEM is not listed, so it counts as 1 below - confirm this is intended */
+ return nft_trans_container_elem(trans)->nelems;
+ }
+
+ return 1; /* every other message type counts as a single entry */
+}
+
static void nf_tables_commit_audit_collect(struct list_head *adl,
- struct nft_table *table, u32 op)
+ const struct nft_trans *trans, u32 op)
{
+ const struct nft_table *table = trans->table;
struct nft_audit_data *adp;
list_for_each_entry(adp, adl, list) {
@@ -8956,7 +10815,7 @@ static void nf_tables_commit_audit_collect(struct list_head *adl,
WARN_ONCE(1, "table=%s not expected in commit list", table->name);
return;
found:
- adp->entries++;
+ adp->entries += nf_tables_commit_audit_entrycount(trans);
if (!adp->op || adp->op > op)
adp->op = op;
}
@@ -8978,14 +10837,48 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
}
}
+static void nft_set_commit_update(struct list_head *set_update_list) /* unlink every set on the list and call its ->commit() op */
+{
+ struct nft_set *set, *next;
+
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update); /* re-initialised so a later list_empty(&set->pending_update) is true again */
+
+ if (!set->ops->commit || set->dead) /* skip sets whose ops have no commit hook and sets marked dead */
+ continue;
+
+ set->ops->commit(set);
+ }
+}
+
+static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) /* increment nft_net->gc_seq and return the new value, which is later passed to nft_gc_seq_end() */
+{
+ unsigned int gc_seq;
+
+ /* Bump gc counter, it becomes odd, this is the busy mark. */
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); /* stores the incremented value; the same value is returned */
+
+ return gc_seq;
+}
+
+static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) /* store @gc_seq + 1 into nft_net->gc_seq; @gc_seq is the value nft_gc_seq_begin() returned */
+{
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+}
+
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
struct nftables_pernet *nft_net = nft_pernet(net);
+ const struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ struct nft_trans_binding *trans_binding;
struct nft_trans *trans, *next;
+ unsigned int base_seq, gc_seq;
+ LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
- unsigned int base_seq;
+ struct nft_ctx ctx;
LIST_HEAD(adl);
int err;
@@ -8994,9 +10887,38 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
return 0;
}
+ nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL);
+
+ list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) {
+ trans = &trans_binding->nft_trans;
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ if (!nft_trans_set_update(trans) &&
+ nft_set_is_anonymous(nft_trans_set(trans)) &&
+ !nft_trans_set_bound(trans)) {
+ pr_warn_once("nftables ruleset with unbound set\n");
+ return -EINVAL;
+ }
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (!nft_trans_chain_update(trans) &&
+ nft_chain_binding(nft_trans_chain(trans)) &&
+ !nft_trans_chain_bound(trans)) {
+ pr_warn_once("nftables ruleset with unbound chain\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type);
+ break;
+ }
+ }
+
/* 0. Validate ruleset, otherwise roll back for error reporting. */
- if (nf_tables_validate(net) < 0)
+ if (nf_tables_validate(net) < 0) {
+ nft_net->validate_state = NFT_VALIDATE_DO;
return -EAGAIN;
+ }
err = nft_flow_rule_offload_commit(net);
if (err < 0)
@@ -9004,9 +10926,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
/* 1. Allocate space for next generation rules_gen_X[] */
list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+ struct nft_table *table = trans->table;
int ret;
- ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table);
+ ret = nf_tables_commit_audit_alloc(&adl, table);
if (ret) {
nf_tables_commit_chain_prepare_cancel(net);
nf_tables_commit_audit_free(&adl);
@@ -9014,7 +10937,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
}
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- chain = trans->ctx.chain;
+ chain = nft_trans_rule_chain(trans);
ret = nf_tables_commit_chain_prepare(net, chain);
if (ret < 0) {
@@ -9035,86 +10958,109 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- base_seq = READ_ONCE(nft_net->base_seq);
+ base_seq = nft_base_seq(net);
while (++base_seq == 0)
;
- WRITE_ONCE(nft_net->base_seq, base_seq);
+ /* pairs with smp_load_acquire in nft_lookup_eval */
+ smp_store_release(&net->nft.base_seq, base_seq);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
- nf_tables_commit_audit_collect(&adl, trans->ctx.table,
- trans->msg_type);
+ struct nft_table *table = trans->table;
+
+ nft_ctx_update(&ctx, trans);
+
+ nf_tables_commit_audit_collect(&adl, trans, trans->msg_type);
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
+ if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
nft_trans_destroy(trans);
break;
}
- if (trans->ctx.table->flags & NFT_TABLE_F_DORMANT)
- nf_tables_table_disable(net, trans->ctx.table);
+ if (table->flags & NFT_TABLE_F_DORMANT)
+ nf_tables_table_disable(net, table);
- trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
+ table->flags &= ~__NFT_TABLE_F_UPDATE;
} else {
- nft_clear(net, trans->ctx.table);
+ nft_clear(net, table);
}
- nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+ nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELTABLE:
- list_del_rcu(&trans->ctx.table->list);
- nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+ case NFT_MSG_DESTROYTABLE:
+ list_del_rcu(&table->list);
+ nf_tables_table_notify(&ctx, trans->msg_type);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
- nft_chain_commit_update(trans);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_chain_commit_update(nft_trans_container_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
+ &nft_trans_chain_hooks(trans));
+ list_splice(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
/* trans destroyed after rcu grace period */
} else {
- nft_chain_commit_drop_policy(trans);
- nft_clear(net, trans->ctx.chain);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
+ nft_clear(net, nft_trans_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL);
nft_trans_destroy(trans);
}
break;
case NFT_MSG_DELCHAIN:
- nft_chain_del(trans->ctx.chain);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
- nf_tables_unregister_hook(trans->ctx.net,
- trans->ctx.table,
- trans->ctx.chain);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+ &nft_trans_chain_hooks(trans));
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_netdev_unregister_hooks(net,
+ &nft_trans_chain_hooks(trans),
+ true);
+ }
+ } else {
+ nft_chain_del(nft_trans_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+ NULL);
+ nf_tables_unregister_hook(ctx.net, ctx.table,
+ nft_trans_chain(trans));
+ }
break;
case NFT_MSG_NEWRULE:
- nft_clear(trans->ctx.net, nft_trans_rule(trans));
- nf_tables_rule_notify(&trans->ctx,
- nft_trans_rule(trans),
+ nft_clear(net, nft_trans_rule(trans));
+ nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
NFT_MSG_NEWRULE);
- if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
nft_flow_rule_destroy(nft_trans_flow_rule(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_DELRULE:
+ case NFT_MSG_DESTROYRULE:
list_del_rcu(&nft_trans_rule(trans)->list);
- nf_tables_rule_notify(&trans->ctx,
- nft_trans_rule(trans),
- NFT_MSG_DELRULE);
- nft_rule_expr_deactivate(&trans->ctx,
- nft_trans_rule(trans),
+ nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
+ trans->msg_type);
+ nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans),
NFT_TRANS_COMMIT);
- if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
+ list_del(&nft_trans_container_set(trans)->list_trans_newset);
if (nft_trans_set_update(trans)) {
struct nft_set *set = nft_trans_set(trans);
WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans));
WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans));
+
+ if (nft_trans_set_size(trans))
+ WRITE_ONCE(set->size, nft_trans_set_size(trans));
} else {
nft_clear(net, nft_trans_set(trans));
/* This avoids hitting -EBUSY when deleting the table
@@ -9122,62 +11068,68 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
*/
if (nft_set_is_anonymous(nft_trans_set(trans)) &&
!list_empty(&nft_trans_set(trans)->bindings))
- trans->ctx.table->use--;
+ nft_use_dec(&table->use);
}
- nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ nf_tables_set_notify(&ctx, nft_trans_set(trans),
NFT_MSG_NEWSET, GFP_KERNEL);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSET:
+ case NFT_MSG_DESTROYSET:
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
- nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
- NFT_MSG_DELSET, GFP_KERNEL);
+ nf_tables_set_notify(&ctx, nft_trans_set(trans),
+ trans->msg_type, GFP_KERNEL);
break;
case NFT_MSG_NEWSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ te = nft_trans_container_elem(trans);
+
+ nft_trans_elems_add(&ctx, te);
- nft_setelem_activate(net, te->set, &te->elem);
- nf_tables_setelem_notify(&trans->ctx, te->set,
- &te->elem,
- NFT_MSG_NEWSETELEM);
+ if (te->set->ops->commit &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSETELEM:
- te = (struct nft_trans_elem *)trans->data;
-
- nf_tables_setelem_notify(&trans->ctx, te->set,
- &te->elem,
- NFT_MSG_DELSETELEM);
- nft_setelem_remove(net, te->set, &te->elem);
- if (!nft_setelem_is_catchall(te->set, &te->elem)) {
- atomic_dec(&te->set->nelems);
- te->set->ndeact--;
+ case NFT_MSG_DESTROYSETELEM:
+ te = nft_trans_container_elem(trans);
+
+ nft_trans_elems_remove(&ctx, te);
+
+ if (te->set->ops->commit &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
}
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
- nft_obj_commit_update(trans);
- nf_tables_obj_notify(&trans->ctx,
+ nft_obj_commit_update(&ctx, trans);
+ nf_tables_obj_notify(&ctx,
nft_trans_obj(trans),
NFT_MSG_NEWOBJ);
} else {
nft_clear(net, nft_trans_obj(trans));
- nf_tables_obj_notify(&trans->ctx,
+ nf_tables_obj_notify(&ctx,
nft_trans_obj(trans),
NFT_MSG_NEWOBJ);
nft_trans_destroy(trans);
}
break;
case NFT_MSG_DELOBJ:
+ case NFT_MSG_DESTROYOBJ:
nft_obj_del(nft_trans_obj(trans));
- nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
- NFT_MSG_DELOBJ);
+ nf_tables_obj_notify(&ctx, nft_trans_obj(trans),
+ trans->msg_type);
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
nft_trans_flowtable(trans)->data.flags =
nft_trans_flowtable_flags(trans);
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans),
NFT_MSG_NEWFLOWTABLE);
@@ -9185,37 +11137,45 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
&nft_trans_flowtable(trans)->hook_list);
} else {
nft_clear(net, nft_trans_flowtable(trans));
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
- &nft_trans_flowtable(trans)->hook_list,
+ NULL,
NFT_MSG_NEWFLOWTABLE);
}
nft_trans_destroy(trans);
break;
case NFT_MSG_DELFLOWTABLE:
+ case NFT_MSG_DESTROYFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans),
- NFT_MSG_DELFLOWTABLE);
+ trans->msg_type);
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans));
} else {
list_del_rcu(&nft_trans_flowtable(trans)->list);
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
- &nft_trans_flowtable(trans)->hook_list,
- NFT_MSG_DELFLOWTABLE);
+ NULL,
+ trans->msg_type);
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable(trans)->hook_list);
}
break;
}
}
+ nft_set_commit_update(&set_update_list);
+
nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
- nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+ nf_tables_commit_audit_log(&adl, nft_base_seq(net));
+
+ nft_gc_seq_end(nft_net, gc_seq);
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
nf_tables_commit_release(net);
return 0;
@@ -9239,29 +11199,35 @@ static void nf_tables_module_autoload(struct net *net)
static void nf_tables_abort_release(struct nft_trans *trans)
{
+ struct nft_ctx ctx = { };
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
- nf_tables_table_destroy(&trans->ctx);
+ nf_tables_table_destroy(trans->table);
break;
case NFT_MSG_NEWCHAIN:
- nf_tables_chain_destroy(&trans->ctx);
+ if (nft_trans_chain_update(trans))
+ nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+ else
+ nf_tables_chain_destroy(nft_trans_chain(trans));
break;
case NFT_MSG_NEWRULE:
- nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(&trans->ctx, nft_trans_set(trans));
+ nft_set_destroy(&ctx, nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
- nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv, true);
+ nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans));
break;
case NFT_MSG_NEWOBJ:
- nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+ nft_obj_destroy(&ctx, nft_trans_obj(trans));
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans))
- nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
else
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
@@ -9269,96 +11235,145 @@ static void nf_tables_abort_release(struct nft_trans *trans)
kfree(trans);
}
+static void nft_set_abort_update(struct list_head *set_update_list) /* unlink every set on the list and call its ->abort() op */
+{
+ struct nft_set *set, *next;
+
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update); /* re-initialised so the set can be put on a pending list again */
+
+ if (!set->ops->abort) /* unlike nft_set_commit_update(), set->dead is not checked here */
+ continue;
+
+ set->ops->abort(set);
+ }
+}
+
static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
{
struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
+ LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
+ struct nft_ctx ctx = {
+ .net = net,
+ };
+ int err = 0;
if (action == NFNL_ABORT_VALIDATE &&
nf_tables_validate(net) < 0)
- return -EAGAIN;
+ err = -EAGAIN;
list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
list) {
+ struct nft_table *table = trans->table;
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
+ if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
nft_trans_destroy(trans);
break;
}
- if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_DORMANT) {
- nf_tables_table_disable(net, trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
- trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT;
+ if (table->flags & __NFT_TABLE_F_WAS_DORMANT) {
+ nf_tables_table_disable(net, table);
+ table->flags |= NFT_TABLE_F_DORMANT;
+ } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
+ table->flags &= ~NFT_TABLE_F_DORMANT;
}
- trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
+ if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
+ table->flags &= ~NFT_TABLE_F_OWNER;
+ table->nlpid = 0;
+ }
+ table->flags &= ~__NFT_TABLE_F_UPDATE;
nft_trans_destroy(trans);
} else {
- list_del_rcu(&trans->ctx.table->list);
+ list_del_rcu(&table->list);
}
break;
case NFT_MSG_DELTABLE:
- nft_clear(trans->ctx.net, trans->ctx.table);
+ case NFT_MSG_DESTROYTABLE:
+ nft_clear(trans->net, table);
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_netdev_unregister_hooks(net,
+ &nft_trans_chain_hooks(trans),
+ true);
+ }
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
} else {
- if (nft_chain_is_bound(trans->ctx.chain)) {
+ if (nft_trans_chain_bound(trans)) {
nft_trans_destroy(trans);
break;
}
- trans->ctx.table->use--;
- nft_chain_del(trans->ctx.chain);
- nf_tables_unregister_hook(trans->ctx.net,
- trans->ctx.table,
- trans->ctx.chain);
+ nft_use_dec_restore(&table->use);
+ nft_chain_del(nft_trans_chain(trans));
+ nf_tables_unregister_hook(trans->net, table,
+ nft_trans_chain(trans));
}
break;
case NFT_MSG_DELCHAIN:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, trans->ctx.chain);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ list_splice(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
+ } else {
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_chain(trans));
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWRULE:
- trans->ctx.chain->use--;
+ if (nft_trans_rule_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&nft_trans_rule_chain(trans)->use);
list_del_rcu(&nft_trans_rule(trans)->list);
- nft_rule_expr_deactivate(&trans->ctx,
+ nft_rule_expr_deactivate(&ctx,
nft_trans_rule(trans),
NFT_TRANS_ABORT);
- if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_DELRULE:
- trans->ctx.chain->use++;
- nft_clear(trans->ctx.net, nft_trans_rule(trans));
- nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
- if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ case NFT_MSG_DESTROYRULE:
+ nft_use_inc_restore(&nft_trans_rule_chain(trans)->use);
+ nft_clear(trans->net, nft_trans_rule(trans));
+ nft_rule_expr_activate(&ctx, nft_trans_rule(trans));
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
nft_flow_rule_destroy(nft_trans_flow_rule(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSET:
+ list_del(&nft_trans_container_set(trans)->list_trans_newset);
if (nft_trans_set_update(trans)) {
nft_trans_destroy(trans);
break;
}
- trans->ctx.table->use--;
+ nft_use_dec_restore(&table->use);
if (nft_trans_set_bound(trans)) {
nft_trans_destroy(trans);
break;
}
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
break;
case NFT_MSG_DELSET:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_set(trans));
+ case NFT_MSG_DESTROYSET:
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_set(trans));
+ if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_activate(&ctx, nft_trans_set(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSETELEM:
@@ -9366,85 +11381,108 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
nft_trans_destroy(trans);
break;
}
- te = (struct nft_trans_elem *)trans->data;
- nft_setelem_remove(net, te->set, &te->elem);
- if (!nft_setelem_is_catchall(te->set, &te->elem))
- atomic_dec(&te->set->nelems);
+ te = nft_trans_container_elem(trans);
+ if (!nft_trans_elems_new_abort(&ctx, te)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+
+ if (te->set->ops->abort &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
break;
case NFT_MSG_DELSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ case NFT_MSG_DESTROYSETELEM:
+ te = nft_trans_container_elem(trans);
- nft_setelem_data_activate(net, te->set, &te->elem);
- nft_setelem_activate(net, te->set, &te->elem);
- if (!nft_setelem_is_catchall(te->set, &te->elem))
- te->set->ndeact--;
+ nft_trans_elems_destroy_abort(&ctx, te);
+ if (te->set->ops->abort &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
- nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans));
+ nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans));
nft_trans_destroy(trans);
} else {
- trans->ctx.table->use--;
+ nft_use_dec_restore(&table->use);
nft_obj_del(nft_trans_obj(trans));
}
break;
case NFT_MSG_DELOBJ:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_obj(trans));
+ case NFT_MSG_DESTROYOBJ:
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_obj(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans));
} else {
- trans->ctx.table->use--;
+ nft_use_dec_restore(&table->use);
list_del_rcu(&nft_trans_flowtable(trans)->list);
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable(trans)->hook_list);
}
break;
case NFT_MSG_DELFLOWTABLE:
+ case NFT_MSG_DESTROYFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
list_splice(&nft_trans_flowtable_hooks(trans),
&nft_trans_flowtable(trans)->hook_list);
} else {
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_flowtable(trans));
}
nft_trans_destroy(trans);
break;
}
}
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+ nft_set_abort_update(&set_update_list);
+
synchronize_rcu();
list_for_each_entry_safe_reverse(trans, next,
&nft_net->commit_list, list) {
- list_del(&trans->list);
+ nft_trans_list_del(trans);
nf_tables_abort_release(trans);
}
- if (action == NFNL_ABORT_AUTOLOAD)
- nf_tables_module_autoload(net);
- else
- nf_tables_module_autoload_cleanup(net);
-
- return 0;
-}
-
-static void nf_tables_cleanup(struct net *net)
-{
- nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+ return err;
}
static int nf_tables_abort(struct net *net, struct sk_buff *skb,
enum nfnl_abort_action action)
{
struct nftables_pernet *nft_net = nft_pernet(net);
- int ret = __nf_tables_abort(net, action);
+ unsigned int gc_seq;
+ int ret;
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+ ret = __nf_tables_abort(net, action);
+ nft_gc_seq_end(nft_net, gc_seq);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+
+ /* module autoload needs to happen after GC sequence update because it
+ * temporarily releases and grabs mutex again.
+ */
+ if (action == NFNL_ABORT_AUTOLOAD)
+ nf_tables_module_autoload(net);
+ else
+ nf_tables_module_autoload_cleanup(net);
mutex_unlock(&nft_net->commit_mutex);
@@ -9457,8 +11495,9 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
bool genid_ok;
mutex_lock(&nft_net->commit_mutex);
+ nft_net->tstamp = get_jiffies_64();
- genid_ok = genid == 0 || nft_net->base_seq == genid;
+ genid_ok = genid == 0 || nft_base_seq(net) == genid;
if (!genid_ok)
mutex_unlock(&nft_net->commit_mutex);
@@ -9473,7 +11512,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
- .cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
.owner = THIS_MODULE,
};
@@ -9510,143 +11548,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
}
EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
-/*
- * Loop detection - walk through the ruleset beginning at the destination chain
- * of a new jump until either the source chain is reached (loop) or all
- * reachable chains have been traversed.
- *
- * The loop check is performed whenever a new jump verdict is added to an
- * expression or verdict map or a verdict map is bound to a new chain.
- */
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
- const struct nft_chain *chain);
-
-static int nft_check_loops(const struct nft_ctx *ctx,
- const struct nft_set_ext *ext)
-{
- const struct nft_data *data;
- int ret;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- ret = nf_tables_check_loops(ctx, data->verdict.chain);
- break;
- default:
- ret = 0;
- break;
- }
-
- return ret;
-}
-
-static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- return nft_check_loops(ctx, ext);
-}
-
-static int nft_set_catchall_loops(const struct nft_ctx *ctx,
- struct nft_set *set)
-{
- u8 genmask = nft_genmask_next(ctx->net);
- struct nft_set_elem_catchall *catchall;
- struct nft_set_ext *ext;
- int ret = 0;
-
- list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
- ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_set_elem_active(ext, genmask))
- continue;
-
- ret = nft_check_loops(ctx, ext);
- if (ret < 0)
- return ret;
- }
-
- return ret;
-}
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
- const struct nft_chain *chain)
-{
- const struct nft_rule *rule;
- const struct nft_expr *expr, *last;
- struct nft_set *set;
- struct nft_set_binding *binding;
- struct nft_set_iter iter;
-
- if (ctx->chain == chain)
- return -ELOOP;
-
- list_for_each_entry(rule, &chain->rules, list) {
- nft_rule_for_each_expr(expr, last, rule) {
- struct nft_immediate_expr *priv;
- const struct nft_data *data;
- int err;
-
- if (strcmp(expr->ops->type->name, "immediate"))
- continue;
-
- priv = nft_expr_priv(expr);
- if (priv->dreg != NFT_REG_VERDICT)
- continue;
-
- data = &priv->data;
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- err = nf_tables_check_loops(ctx,
- data->verdict.chain);
- if (err < 0)
- return err;
- break;
- default:
- break;
- }
- }
- }
-
- list_for_each_entry(set, &ctx->table->sets, list) {
- if (!nft_is_active_next(ctx->net, set))
- continue;
- if (!(set->flags & NFT_SET_MAP) ||
- set->dtype != NFT_DATA_VERDICT)
- continue;
-
- list_for_each_entry(binding, &set->bindings, list) {
- if (!(binding->flags & NFT_SET_MAP) ||
- binding->chain != chain)
- continue;
-
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nf_tables_loop_check_setelem;
-
- set->ops->walk(ctx, set, &iter);
- if (!iter.err)
- iter.err = nft_set_catchall_loops(ctx, set);
-
- if (iter.err < 0)
- return iter.err;
- }
- }
-
- return 0;
-}
-
/**
* nft_parse_u32_check - fetch u32 attribute and check for maximum value
*
@@ -9725,10 +11626,11 @@ static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
return 0;
}
-int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
+int nft_parse_register_load(const struct nft_ctx *ctx,
+ const struct nlattr *attr, u8 *sreg, u32 len)
{
- u32 reg;
- int err;
+ int err, invalid_reg;
+ u32 reg, next_register;
err = nft_parse_register(attr, &reg);
if (err < 0)
@@ -9738,11 +11640,36 @@ int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
if (err < 0)
return err;
+ next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg;
+
+ /* Can't happen: nft_validate_register_load() should have failed */
+ if (WARN_ON_ONCE(next_register > NFT_REG32_NUM))
+ return -EINVAL;
+
+ /* find first register that did not see an earlier store. */
+ invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg);
+
+ /* invalid register within the range that we're loading from? */
+ if (invalid_reg < next_register)
+ return -ENODATA;
+
*sreg = reg;
return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_register_load);
+static void nft_saw_register_store(const struct nft_ctx *__ctx,
+ int reg, unsigned int len)
+{
+ unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ struct nft_ctx *ctx = (struct nft_ctx *)__ctx;
+
+ if (WARN_ON_ONCE(len == 0 || reg < 0))
+ return;
+
+ bitmap_set(ctx->reg_inited, reg, registers);
+}
+
static int nft_validate_register_store(const struct nft_ctx *ctx,
enum nft_registers reg,
const struct nft_data *data,
@@ -9759,13 +11686,16 @@ static int nft_validate_register_store(const struct nft_ctx *ctx,
if (data != NULL &&
(data->verdict.code == NFT_GOTO ||
data->verdict.code == NFT_JUMP)) {
- err = nf_tables_check_loops(ctx, data->verdict.chain);
+ err = nft_chain_validate(ctx, data->verdict.chain);
if (err < 0)
return err;
}
- return 0;
+ break;
default:
+ if (type != NFT_DATA_VALUE)
+ return -EINVAL;
+
if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
return -EINVAL;
if (len == 0)
@@ -9774,10 +11704,11 @@ static int nft_validate_register_store(const struct nft_ctx *ctx,
sizeof_field(struct nft_regs, data))
return -ERANGE;
- if (data != NULL && type != NFT_DATA_VALUE)
- return -EINVAL;
- return 0;
+ break;
}
+
+ nft_saw_register_store(ctx, reg, len);
+ return 0;
}
int nft_parse_register_store(const struct nft_ctx *ctx,
@@ -9823,19 +11754,16 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
if (!tb[NFTA_VERDICT_CODE])
return -EINVAL;
+
+ /* zero padding hole for memcmp */
+ memset(data, 0, sizeof(*data));
data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
switch (data->verdict.code) {
- default:
- switch (data->verdict.code & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_DROP:
- case NF_QUEUE:
- break;
- default:
- return -EINVAL;
- }
- fallthrough;
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ break;
case NFT_CONTINUE:
case NFT_BREAK:
case NFT_RETURN:
@@ -9848,7 +11776,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
genmask);
} else if (tb[NFTA_VERDICT_CHAIN_ID]) {
chain = nft_chain_lookup_byid(ctx->net, ctx->table,
- tb[NFTA_VERDICT_CHAIN_ID]);
+ tb[NFTA_VERDICT_CHAIN_ID],
+ genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
} else {
@@ -9864,10 +11793,13 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
if (desc->flags & NFT_DATA_DESC_SETELEM &&
chain->flags & NFT_CHAIN_BINDING)
return -EINVAL;
+ if (!nft_use_inc(&chain->use))
+ return -EMFILE;
- chain->use++;
data->verdict.chain = chain;
break;
+ default:
+ return -EINVAL;
}
desc->len = sizeof(data->verdict);
@@ -9878,22 +11810,12 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
static void nft_verdict_uninit(const struct nft_data *data)
{
struct nft_chain *chain;
- struct nft_rule *rule;
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
chain = data->verdict.chain;
- chain->use--;
-
- if (!nft_chain_is_bound(chain))
- break;
-
- chain->table->use--;
- list_for_each_entry(rule, &chain->rules, list)
- chain->use--;
-
- nft_chain_del(chain);
+ nft_use_dec(&chain->use);
break;
}
}
@@ -10052,27 +11974,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-int __nft_release_basechain(struct nft_ctx *ctx)
-{
- struct nft_rule *rule, *nr;
-
- if (WARN_ON(!nft_is_base_chain(ctx->chain)))
- return 0;
-
- nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
- list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
- list_del(&rule->list);
- ctx->chain->use--;
- nf_tables_rule_release(ctx, rule);
- }
- nft_chain_del(ctx->chain);
- ctx->table->use--;
- nf_tables_chain_destroy(ctx);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(__nft_release_basechain);
-
static void __nft_release_hook(struct net *net, struct nft_table *table)
{
struct nft_flowtable *flowtable;
@@ -10081,7 +11982,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table)
list_for_each_entry(chain, &table->chains, list)
__nf_tables_unregister_hook(net, table, chain, true);
list_for_each_entry(flowtable, &table->flowtables, list)
- __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list,
+ __nft_unregister_flowtable_net_hooks(net, flowtable,
+ &flowtable->hook_list,
true);
}
@@ -10113,35 +12015,40 @@ static void __nft_release_table(struct net *net, struct nft_table *table)
ctx.family = table->family;
ctx.table = table;
list_for_each_entry(chain, &table->chains, list) {
+ if (nft_chain_binding(chain))
+ continue;
+
ctx.chain = chain;
list_for_each_entry_safe(rule, nr, &chain->rules, list) {
list_del(&rule->list);
- chain->use--;
+ nft_use_dec(&chain->use);
nf_tables_rule_release(&ctx, rule);
}
}
list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
list_del(&flowtable->list);
- table->use--;
+ nft_use_dec(&table->use);
nf_tables_flowtable_destroy(flowtable);
}
list_for_each_entry_safe(set, ns, &table->sets, list) {
list_del(&set->list);
- table->use--;
+ nft_use_dec(&table->use);
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(&ctx, set);
+
nft_set_destroy(&ctx, set);
}
list_for_each_entry_safe(obj, ne, &table->objects, list) {
nft_obj_del(obj);
- table->use--;
+ nft_use_dec(&table->use);
nft_obj_destroy(&ctx, obj);
}
list_for_each_entry_safe(chain, nc, &table->chains, list) {
- ctx.chain = chain;
nft_chain_del(chain);
- table->use--;
- nf_tables_chain_destroy(&ctx);
+ nft_use_dec(&table->use);
+ nf_tables_chain_destroy(chain);
}
- nf_tables_table_destroy(&ctx);
+ nf_tables_table_destroy(table);
}
static void __nft_release_tables(struct net *net)
@@ -10168,6 +12075,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
struct net *net = n->net;
unsigned int deleted;
bool restart = false;
+ unsigned int gc_seq;
if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
return NOTIFY_DONE;
@@ -10175,12 +12083,18 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
nft_net = nft_pernet(net);
deleted = 0;
mutex_lock(&nft_net->commit_mutex);
- if (!list_empty(&nf_tables_destroy_list))
- rcu_barrier();
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+
+ nf_tables_trans_destroy_flush_work(net);
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
n->portid == table->nlpid) {
+ if (table->flags & NFT_TABLE_F_PERSIST) {
+ table->flags &= ~NFT_TABLE_F_OWNER;
+ continue;
+ }
__nft_release_hook(net, table);
list_del_rcu(&table->list);
to_delete[deleted++] = table;
@@ -10197,6 +12111,8 @@ again:
if (restart)
goto again;
}
+ nft_gc_seq_end(nft_net, gc_seq);
+
mutex_unlock(&nft_net->commit_mutex);
return NOTIFY_DONE;
@@ -10212,11 +12128,16 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&nft_net->tables);
INIT_LIST_HEAD(&nft_net->commit_list);
+ INIT_LIST_HEAD(&nft_net->destroy_list);
+ INIT_LIST_HEAD(&nft_net->commit_set_list);
+ INIT_LIST_HEAD(&nft_net->binding_list);
INIT_LIST_HEAD(&nft_net->module_list);
INIT_LIST_HEAD(&nft_net->notify_list);
mutex_init(&nft_net->commit_mutex);
- nft_net->base_seq = 1;
+ net->nft.base_seq = 1;
+ nft_net->gc_seq = 0;
nft_net->validate_state = NFT_VALIDATE_SKIP;
+ INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
return 0;
}
@@ -10233,22 +12154,41 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net)
static void __net_exit nf_tables_exit_net(struct net *net)
{
struct nftables_pernet *nft_net = nft_pernet(net);
+ unsigned int gc_seq;
mutex_lock(&nft_net->commit_mutex);
- if (!list_empty(&nft_net->commit_list) ||
- !list_empty(&nft_net->module_list))
- __nf_tables_abort(net, NFNL_ABORT_NONE);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+ if (!list_empty(&nft_net->module_list))
+ nf_tables_module_autoload_cleanup(net);
+
+ cancel_work_sync(&nft_net->destroy_work);
__nft_release_tables(net);
+
+ nft_gc_seq_end(nft_net, gc_seq);
+
mutex_unlock(&nft_net->commit_mutex);
+
WARN_ON_ONCE(!list_empty(&nft_net->tables));
WARN_ON_ONCE(!list_empty(&nft_net->module_list));
WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->destroy_list));
+}
+
+static void nf_tables_exit_batch(struct list_head *net_exit_list)
+{
+ flush_work(&trans_gc_work);
}
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
.pre_exit = nf_tables_pre_exit_net,
.exit = nf_tables_exit_net,
+ .exit_batch = nf_tables_exit_batch,
.id = &nf_tables_net_id,
.size = sizeof(struct nftables_pernet),
};
@@ -10257,6 +12197,14 @@ static int __init nf_tables_module_init(void)
{
int err;
+ BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0);
+
err = register_pernet_subsys(&nf_tables_net_ops);
if (err < 0)
return err;
@@ -10320,7 +12268,7 @@ static void __exit nf_tables_module_exit(void)
nft_chain_filter_fini();
nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
- cancel_work_sync(&trans_destroy_work);
+ cancel_work_sync(&trans_gc_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
nf_tables_core_module_exit();
@@ -10331,4 +12279,5 @@ module_exit(nf_tables_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Framework for packet filtering and classification");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 709a736c301c..6557a4018c09 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -21,39 +21,54 @@
#include <net/netfilter/nf_log.h>
#include <net/netfilter/nft_meta.h>
-static noinline void __nft_trace_packet(struct nft_traceinfo *info,
- const struct nft_chain *chain,
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static struct static_key_false nf_tables_skip_direct_calls;
+
+static inline bool nf_skip_indirect_calls(void)
+{
+ return static_branch_likely(&nf_tables_skip_direct_calls);
+}
+
+static inline void __init nf_skip_indirect_calls_enable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
+ static_branch_enable(&nf_tables_skip_direct_calls);
+}
+#else
+static inline void nf_skip_indirect_calls_enable(void) { }
+#endif /* CONFIG_MITIGATION_RETPOLINE */
+
+static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
+ struct nft_traceinfo *info,
enum nft_trace_types type)
{
if (!info->trace || !info->nf_trace)
return;
- info->chain = chain;
info->type = type;
- nft_trace_notify(info);
+ nft_trace_notify(pkt, verdict, rule, info);
}
static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+ struct nft_verdict *verdict,
struct nft_traceinfo *info,
- const struct nft_chain *chain,
const struct nft_rule_dp *rule,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
info->nf_trace = pkt->skb->nf_trace;
- info->rule = rule;
- __nft_trace_packet(info, chain, type);
+ __nft_trace_packet(pkt, verdict, rule, info, type);
}
}
static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt,
struct nft_traceinfo *info)
{
- if (static_branch_unlikely(&nft_trace_enabled)) {
- if (info->trace)
- info->nf_trace = pkt->skb->nf_trace;
- }
+ if (static_branch_unlikely(&nft_trace_enabled))
+ info->nf_trace = pkt->skb->nf_trace;
}
static void nft_bitwise_fast_eval(const struct nft_expr *expr,
@@ -90,13 +105,14 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
-static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
- const struct nft_chain *chain,
+static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
const struct nft_regs *regs)
{
enum nft_trace_types type;
- switch (regs->verdict.code) {
+ switch (regs->verdict.code & NF_VERDICT_MASK) {
case NFT_CONTINUE:
case NFT_RETURN:
type = NFT_TRACETYPE_RETURN;
@@ -109,22 +125,20 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
type = NFT_TRACETYPE_RULE;
if (info->trace)
- info->nf_trace = info->pkt->skb->nf_trace;
+ info->nf_trace = pkt->skb->nf_trace;
break;
}
- __nft_trace_packet(info, chain, type);
+ __nft_trace_packet(pkt, &regs->verdict, rule, info, type);
}
-static inline void nft_trace_verdict(struct nft_traceinfo *info,
- const struct nft_chain *chain,
+static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
const struct nft_rule_dp *rule,
const struct nft_regs *regs)
{
- if (static_branch_unlikely(&nft_trace_enabled)) {
- info->rule = rule;
- __nft_trace_verdict(info, chain, regs);
- }
+ if (static_branch_unlikely(&nft_trace_enabled))
+ __nft_trace_verdict(pkt, info, rule, regs);
}
static bool nft_payload_fast_eval(const struct nft_expr *expr,
@@ -141,7 +155,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
else {
if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
return false;
- ptr = skb_network_header(skb) + nft_thoff(pkt);
+ ptr = skb->data + nft_thoff(pkt);
}
ptr += priv->offset;
@@ -183,17 +197,20 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
}
struct nft_jumpstack {
- const struct nft_chain *chain;
const struct nft_rule_dp *rule;
- const struct nft_rule_dp *last_rule;
};
static void expr_call_ops_eval(const struct nft_expr *expr,
struct nft_regs *regs,
struct nft_pktinfo *pkt)
{
-#ifdef CONFIG_RETPOLINE
- unsigned long e = (unsigned long)expr->ops->eval;
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ unsigned long e;
+
+ if (nf_skip_indirect_calls())
+ goto indirect_call;
+
+ e = (unsigned long)expr->ops->eval;
#define X(e, fun) \
do { if ((e) == (unsigned long)(fun)) \
return fun(expr, regs, pkt); } while (0)
@@ -203,21 +220,26 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
X(e, nft_counter_eval);
X(e, nft_meta_get_eval);
X(e, nft_lookup_eval);
+#if IS_ENABLED(CONFIG_NFT_CT)
+ X(e, nft_ct_get_fast_eval);
+#endif
X(e, nft_range_eval);
X(e, nft_immediate_eval);
X(e, nft_byteorder_eval);
X(e, nft_dynset_eval);
X(e, nft_rt_get_eval);
X(e, nft_bitwise_eval);
+ X(e, nft_objref_eval);
+ X(e, nft_objref_map_eval);
#undef X
-#endif /* CONFIG_RETPOLINE */
+indirect_call:
+#endif /* CONFIG_MITIGATION_RETPOLINE */
expr->ops->eval(expr, regs, pkt);
}
#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0]
#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size
#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen]
-#define nft_rule_next(rule) (void *)rule + sizeof(*rule) + rule->dlen
#define nft_rule_dp_for_each_expr(expr, last, rule) \
for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
@@ -228,10 +250,10 @@ unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
- const struct nft_rule_dp *rule, *last_rule;
const struct net *net = nft_net(pkt);
const struct nft_expr *expr, *last;
- struct nft_regs regs = {};
+ const struct nft_rule_dp *rule;
+ struct nft_regs regs;
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
bool genbit = READ_ONCE(net->nft.gencursor);
@@ -240,7 +262,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
info.trace = false;
if (static_branch_unlikely(&nft_trace_enabled))
- nft_trace_init(&info, pkt, &regs.verdict, basechain);
+ nft_trace_init(&info, pkt, basechain);
do_chain:
if (genbit)
blob = rcu_dereference(chain->blob_gen_1);
@@ -248,10 +270,9 @@ do_chain:
blob = rcu_dereference(chain->blob_gen_0);
rule = (struct nft_rule_dp *)blob->data;
- last_rule = (void *)blob->data + blob->size;
next_rule:
regs.verdict.code = NFT_CONTINUE;
- for (; rule < last_rule; rule = nft_rule_next(rule)) {
+ for (; !rule->is_last ; rule = nft_rule_next(rule)) {
nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
@@ -273,30 +294,29 @@ next_rule:
nft_trace_copy_nftrace(pkt, &info);
continue;
case NFT_CONTINUE:
- nft_trace_packet(pkt, &info, chain, rule,
+ nft_trace_packet(pkt, &regs.verdict, &info, rule,
NFT_TRACETYPE_RULE);
continue;
}
break;
}
- nft_trace_verdict(&info, chain, rule, &regs);
+ nft_trace_verdict(pkt, &info, rule, &regs);
switch (regs.verdict.code & NF_VERDICT_MASK) {
case NF_ACCEPT:
- case NF_DROP:
case NF_QUEUE:
case NF_STOLEN:
return regs.verdict.code;
+ case NF_DROP:
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
}
switch (regs.verdict.code) {
case NFT_JUMP:
if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
return NF_DROP;
- jumpstack[stackptr].chain = chain;
jumpstack[stackptr].rule = nft_rule_next(rule);
- jumpstack[stackptr].last_rule = last_rule;
stackptr++;
fallthrough;
case NFT_GOTO:
@@ -311,17 +331,18 @@ next_rule:
if (stackptr > 0) {
stackptr--;
- chain = jumpstack[stackptr].chain;
rule = jumpstack[stackptr].rule;
- last_rule = jumpstack[stackptr].last_rule;
goto next_rule;
}
- nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY);
+ nft_trace_packet(pkt, &regs.verdict, &info, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
+ if (nft_base_chain(basechain)->policy == NF_DROP)
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
+
return nft_base_chain(basechain)->policy;
}
EXPORT_SYMBOL_GPL(nft_do_chain);
@@ -369,6 +390,8 @@ int __init nf_tables_core_module_init(void)
goto err;
}
+ nf_skip_indirect_calls_enable();
+
return 0;
err:
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 910ef881c3b8..fd30e205de84 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -35,12 +35,12 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
struct nft_flow_key *mask = &match->mask;
struct nft_flow_key *key = &match->key;
- if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL))
+ if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL))
return;
key->control.addr_type = addr_type;
mask->control.addr_type = 0xffff;
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL);
match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] =
offsetof(struct nft_flow_key, control);
}
@@ -59,7 +59,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
.mask = match->mask.basic.n_proto,
};
- if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) &&
+ if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) &&
(match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) ||
match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) {
match->key.basic.n_proto = match->key.cvlan.vlan_tpid;
@@ -70,8 +70,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
match->mask.vlan.vlan_tpid = ethertype.mask;
match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] =
offsetof(struct nft_flow_key, cvlan);
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN);
- } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) &&
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN);
+ } else if (match->dissector.used_keys &
+ BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) &&
(match->key.basic.n_proto == htons(ETH_P_8021Q) ||
match->key.basic.n_proto == htons(ETH_P_8021AD))) {
match->key.basic.n_proto = match->key.vlan.vlan_tpid;
@@ -80,7 +81,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
match->mask.vlan.vlan_tpid = ethertype.mask;
match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
offsetof(struct nft_flow_key, vlan);
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN);
}
}
@@ -219,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
bool nft_chain_offload_support(const struct nft_base_chain *basechain)
{
+ struct nf_hook_ops *ops;
struct net_device *dev;
struct nft_hook *hook;
@@ -226,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain)
return false;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.pf != NFPROTO_NETDEV ||
- hook->ops.hooknum != NF_NETDEV_INGRESS)
- return false;
-
- dev = hook->ops.dev;
- if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists())
- return false;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->pf != NFPROTO_NETDEV ||
+ ops->hooknum != NF_NETDEV_INGRESS)
+ return false;
+
+ dev = ops->dev;
+ if (!dev->netdev_ops->ndo_setup_tc &&
+ !flow_indr_dev_exists())
+ return false;
+ }
}
return true;
@@ -454,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain,
const struct net_device *this_dev,
enum flow_block_command cmd)
{
- struct net_device *dev;
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, i = 0;
list_for_each_entry(hook, &basechain->hook_list, list) {
- dev = hook->ops.dev;
- if (this_dev && this_dev != dev)
- continue;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (this_dev && this_dev != ops->dev)
+ continue;
- err = nft_chain_offload_cmd(basechain, dev, cmd);
- if (err < 0 && cmd == FLOW_BLOCK_BIND) {
- if (!this_dev)
- goto err_flow_block;
+ err = nft_chain_offload_cmd(basechain, ops->dev, cmd);
+ if (err < 0 && cmd == FLOW_BLOCK_BIND) {
+ if (!this_dev)
+ goto err_flow_block;
- return err;
+ return err;
+ }
+ i++;
}
- i++;
}
return 0;
err_flow_block:
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (i-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
- dev = hook->ops.dev;
- nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND);
+ nft_chain_offload_cmd(basechain, ops->dev,
+ FLOW_BLOCK_UNBIND);
+ }
}
return err;
}
@@ -512,38 +520,38 @@ static void nft_flow_rule_offload_abort(struct net *net,
int err = 0;
list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) {
- if (trans->ctx.family != NFPROTO_NETDEV)
+ if (trans->table->family != NFPROTO_NETDEV)
continue;
switch (trans->msg_type) {
case NFT_MSG_NEWCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
nft_trans_chain_update(trans))
continue;
- err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
FLOW_BLOCK_UNBIND);
break;
case NFT_MSG_DELCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
FLOW_BLOCK_BIND);
break;
case NFT_MSG_NEWRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
NULL, FLOW_CLS_DESTROY);
break;
case NFT_MSG_DELRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
nft_trans_flow_rule(trans),
FLOW_CLS_REPLACE);
@@ -563,46 +571,46 @@ int nft_flow_rule_offload_commit(struct net *net)
u8 policy;
list_for_each_entry(trans, &nft_net->commit_list, list) {
- if (trans->ctx.family != NFPROTO_NETDEV)
+ if (trans->table->family != NFPROTO_NETDEV)
continue;
switch (trans->msg_type) {
case NFT_MSG_NEWCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
nft_trans_chain_update(trans))
continue;
policy = nft_trans_chain_policy(trans);
- err = nft_flow_offload_chain(trans->ctx.chain, &policy,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
FLOW_BLOCK_BIND);
break;
case NFT_MSG_DELCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
policy = nft_trans_chain_policy(trans);
- err = nft_flow_offload_chain(trans->ctx.chain, &policy,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
FLOW_BLOCK_UNBIND);
break;
case NFT_MSG_NEWRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- if (trans->ctx.flags & NLM_F_REPLACE ||
- !(trans->ctx.flags & NLM_F_APPEND)) {
+ if (trans->flags & NLM_F_REPLACE ||
+ !(trans->flags & NLM_F_APPEND)) {
err = -EOPNOTSUPP;
break;
}
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
nft_trans_flow_rule(trans),
FLOW_CLS_REPLACE);
break;
case NFT_MSG_DELRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
NULL, FLOW_CLS_DESTROY);
break;
@@ -637,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n
found = NULL;
basechain = nft_base_chain(chain);
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev != dev)
+ if (!nft_hook_find_ops(hook, dev))
continue;
found = hook;
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 1163ba9c1401..a88abae5a9de 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -15,6 +15,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
@@ -90,6 +91,52 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb,
return 0;
}
+static int nf_trace_fill_ct_info(struct sk_buff *nlskb,
+ const struct sk_buff *skb)
+{
+ const struct nf_ct_hook *ct_hook;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ u32 state;
+
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook)
+ return 0;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) {
+ if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */
+ return 0;
+
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ } else {
+ state = NF_CT_STATE_BIT(ctinfo);
+ }
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state)))
+ return -1;
+
+ if (ct) {
+ u32 id = ct_hook->get_id(&ct->ct_general);
+ u32 status = READ_ONCE(ct->status);
+ u8 dir = CTINFO2DIR(ctinfo);
+
+ if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir))
+ return -1;
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id))
+ return -1;
+
+ /* Kernel implementation detail, withhold this from userspace for now */
+ status &= ~IPS_NAT_CLASH;
+
+ if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status)))
+ return -1;
+ }
+
+ return 0;
+}
+
static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
@@ -124,9 +171,11 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
}
static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
const struct nft_traceinfo *info)
{
- if (!info->rule || info->rule->is_last)
+ if (!rule || rule->is_last)
return 0;
/* a continue verdict with ->type == RETURN means that this is
@@ -135,15 +184,16 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
* Since no rule matched, the ->rule pointer is invalid.
*/
if (info->type == NFT_TRACETYPE_RETURN &&
- info->verdict->code == NFT_CONTINUE)
+ verdict->code == NFT_CONTINUE)
return 0;
return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
- cpu_to_be64(info->rule->handle),
+ cpu_to_be64(rule->handle),
NFTA_TRACE_PAD);
}
-static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
+static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
+ struct nft_traceinfo *info)
{
switch (info->type) {
case NFT_TRACETYPE_RETURN:
@@ -153,7 +203,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
return false;
}
- switch (info->verdict->code) {
+ switch (verdict->code) {
case NFT_JUMP:
case NFT_GOTO:
break;
@@ -164,9 +214,31 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
return true;
}
-void nft_trace_notify(struct nft_traceinfo *info)
+static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule,
+ const struct nft_traceinfo *info)
+{
+ const struct nft_rule_dp_last *last;
+
+ if (!rule)
+ return &info->basechain->chain;
+
+ while (!rule->is_last)
+ rule = nft_rule_next(rule);
+
+ last = (const struct nft_rule_dp_last *)rule;
+
+ if (WARN_ON_ONCE(!last->chain))
+ return &info->basechain->chain;
+
+ return last->chain;
+}
+
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
+ struct nft_traceinfo *info)
{
- const struct nft_pktinfo *pkt = info->pkt;
+ const struct nft_chain *chain;
struct nlmsghdr *nlh;
struct sk_buff *skb;
unsigned int size;
@@ -176,14 +248,20 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
return;
+ chain = nft_trace_get_chain(rule, info);
+
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
- nla_total_size(strlen(info->chain->table->name)) +
- nla_total_size(strlen(info->chain->name)) +
+ nla_total_size(strlen(chain->table->name)) +
+ nla_total_size(strlen(chain->name)) +
nla_total_size_64bit(sizeof(__be64)) + /* rule handle */
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
- nla_total_size(sizeof(u32)) + /* id */
+ nla_total_size(sizeof(u32)) + /* ct id */
+ nla_total_size(sizeof(u8)) + /* ct direction */
+ nla_total_size(sizeof(u32)) + /* ct state */
+ nla_total_size(sizeof(u32)) + /* ct status */
+ nla_total_size(sizeof(u32)) + /* trace id */
nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) +
@@ -195,8 +273,8 @@ void nft_trace_notify(struct nft_traceinfo *info)
nla_total_size(sizeof(u32)) + /* nfproto */
nla_total_size(sizeof(u32)); /* policy */
- if (nft_trace_have_verdict_chain(info))
- size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */
+ if (nft_trace_have_verdict_chain(verdict, info))
+ size += nla_total_size(strlen(verdict->chain->name)); /* jump target */
skb = nlmsg_new(size, GFP_ATOMIC);
if (!skb)
@@ -217,13 +295,13 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
+ if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name))
+ if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
goto nla_put_failure;
- if (nf_trace_fill_rule_info(skb, info))
+ if (nf_trace_fill_rule_info(skb, verdict, rule, info))
goto nla_put_failure;
switch (info->type) {
@@ -231,17 +309,21 @@ void nft_trace_notify(struct nft_traceinfo *info)
case __NFT_TRACETYPE_MAX:
break;
case NFT_TRACETYPE_RETURN:
- case NFT_TRACETYPE_RULE:
- if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
+ case NFT_TRACETYPE_RULE: {
+ unsigned int v;
+
+ if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict))
goto nla_put_failure;
/* pkt->skb undefined iff NF_STOLEN, disable dump */
- if (info->verdict->code == NF_STOLEN)
+ v = verdict->code & NF_VERDICT_MASK;
+ if (v == NF_STOLEN)
info->packet_dumped = true;
else
mark = pkt->skb->mark;
break;
+ }
case NFT_TRACETYPE_POLICY:
mark = pkt->skb->mark;
@@ -260,6 +342,10 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (nf_trace_fill_pkt_info(skb, pkt))
goto nla_put_failure;
+
+ if (nf_trace_fill_ct_info(skb, pkt->skb))
+ goto nla_put_failure;
+
info->packet_dumped = true;
}
@@ -273,7 +359,6 @@ void nft_trace_notify(struct nft_traceinfo *info)
}
void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
- const struct nft_verdict *verdict,
const struct nft_chain *chain)
{
static siphash_key_t trace_key __read_mostly;
@@ -283,13 +368,11 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
info->trace = true;
info->nf_trace = pkt->skb->nf_trace;
info->packet_dumped = false;
- info->pkt = pkt;
- info->verdict = verdict;
net_get_random_once(&trace_key, sizeof(trace_key));
info->skbid = (u32)siphash_3u32(hash32_ptr(skb),
- skb_get_hash(skb),
+ skb_get_hash_net(nft_net(pkt), skb),
skb->skb_iif,
&trace_key);
}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 6d18fb346868..811d02b4c4f7 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -29,6 +29,7 @@
#include <net/netlink.h>
#include <net/netns/generic.h>
+#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
@@ -375,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
struct netlink_ext_ack extack;
+ struct nlmsghdr *onlh = nlh;
LIST_HEAD(err_list);
u32 status;
int err;
@@ -385,6 +387,7 @@ replay:
status = 0;
replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
+ nlh = onlh;
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
@@ -401,31 +404,36 @@ replay_abort:
{
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
}
if (!ss->valid_genid || !ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
if (!try_module_get(ss->owner)) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
if (!ss->valid_genid(net, genid)) {
module_put(ss->owner);
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -ERESTART, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
nfnl_unlock(subsys_id);
+ if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
+ nfnl_err_add(&err_list, nlh, 0, &extack);
+ }
+
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -511,7 +519,7 @@ replay_abort:
err = nla_parse_deprecated(cda,
ss->cb[cb_id].attr_count,
attr, attrlen,
- ss->cb[cb_id].policy, NULL);
+ ss->cb[cb_id].policy, &extack);
if (err < 0)
goto ack;
@@ -532,7 +540,8 @@ ack:
* processed, this avoids that the same error is
* reported several times when replaying the batch.
*/
- if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
+ if (err == -ENOMEM ||
+ nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
/* We failed to enqueue an error, reset the
* list of errors and send OOM to userspace
* pointing to the batch header.
@@ -560,7 +569,7 @@ done:
if (status & NFNL_BATCH_REPLAY) {
ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD);
nfnl_err_reset(&err_list);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
@@ -571,6 +580,9 @@ done:
} else if (err) {
ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
+ nfnl_err_add(&err_list, nlh, 0, &extack);
}
} else {
enum nfnl_abort_action abort_action;
@@ -583,17 +595,15 @@ done:
err = ss->abort(net, oskb, abort_action);
if (err == -EAGAIN) {
nfnl_err_reset(&err_list);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
status |= NFNL_BATCH_FAILURE;
goto replay_abort;
}
}
- if (ss->cleanup)
- ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
}
@@ -685,12 +695,12 @@ static void nfnetlink_bind_event(struct net *net, unsigned int group)
group_bit = (1 << group);
spin_lock(&nfnl_grp_active_lock);
- v = READ_ONCE(net->ct.ctnetlink_has_listener);
+ v = READ_ONCE(nf_ctnetlink_has_listener);
if ((v & group_bit) == 0) {
v |= group_bit;
/* read concurrently without nfnl_grp_active_lock held. */
- WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+ WRITE_ONCE(nf_ctnetlink_has_listener, v);
}
spin_unlock(&nfnl_grp_active_lock);
@@ -744,12 +754,12 @@ static void nfnetlink_unbind(struct net *net, int group)
spin_lock(&nfnl_grp_active_lock);
if (!nfnetlink_has_listeners(net, group)) {
- u8 v = READ_ONCE(net->ct.ctnetlink_has_listener);
+ u8 v = READ_ONCE(nf_ctnetlink_has_listener);
v &= ~group_bit;
/* read concurrently without nfnl_grp_active_lock held. */
- WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+ WRITE_ONCE(nf_ctnetlink_has_listener, v);
}
spin_unlock(&nfnl_grp_active_lock);
#endif
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index f466af4f8531..38d75484e531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -366,8 +366,7 @@ static int cttimeout_default_set(struct sk_buff *skb,
__u8 l4num;
int ret;
- if (!cda[CTA_TIMEOUT_L3PROTO] ||
- !cda[CTA_TIMEOUT_L4PROTO] ||
+ if (!cda[CTA_TIMEOUT_L4PROTO] ||
!cda[CTA_TIMEOUT_DATA])
return -EINVAL;
@@ -462,11 +461,6 @@ static int cttimeout_default_get(struct sk_buff *skb,
case IPPROTO_UDPLITE:
timeouts = nf_udp_pernet(info->net)->timeouts;
break;
- case IPPROTO_DCCP:
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- timeouts = nf_dccp_pernet(info->net)->dccp_timeout;
-#endif
- break;
case IPPROTO_ICMPV6:
timeouts = &nf_icmpv6_pernet(info->net)->timeout;
break;
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
index 8120aadf6a0f..92d869317cba 100644
--- a/net/netfilter/nfnetlink_hook.c
+++ b/net/netfilter/nfnetlink_hook.c
@@ -5,6 +5,7 @@
* Author: Florian Westphal <fw@strlen.de>
*/
+#include <linux/bpf.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
@@ -57,60 +58,136 @@ struct nfnl_dump_hook_data {
u8 hook;
};
-static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
- const struct nfnl_dump_hook_data *ctx,
- unsigned int seq,
- const struct nf_hook_ops *ops)
+static struct nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t)
{
- struct net *net = sock_net(nlskb->sk);
- struct nlattr *nest, *nest2;
- struct nft_chain *chain;
- int ret = 0;
+ struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+ int ret;
- if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES)
- return 0;
+ if (!nest)
+ return NULL;
- chain = ops->priv;
- if (WARN_ON_ONCE(!chain))
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t));
+ if (ret == 0)
+ return nest;
+
+ nla_nest_cancel(nlskb, nest);
+ return NULL;
+}
+
+static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ const struct bpf_prog *prog)
+{
+ struct nlattr *nest, *nest2;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK))
return 0;
- if (!nft_is_active(net, chain))
+ if (WARN_ON_ONCE(!prog))
return 0;
- nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF);
if (!nest)
return -EMSGSIZE;
- ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE,
- htonl(NFNL_HOOK_TYPE_NFTABLES));
- if (ret)
- goto cancel_nest;
-
nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
if (!nest2)
goto cancel_nest;
- ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name);
- if (ret)
- goto cancel_nest;
-
- ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name);
- if (ret)
- goto cancel_nest;
-
- ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family);
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id));
if (ret)
goto cancel_nest;
nla_nest_end(nlskb, nest2);
nla_nest_end(nlskb, nest);
- return ret;
+ return 0;
cancel_nest:
nla_nest_cancel(nlskb, nest);
return -EMSGSIZE;
}
+static int nfnl_hook_put_nft_info_desc(struct sk_buff *nlskb, const char *tname,
+ const char *name, u8 family)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+ if (!nest ||
+ nla_put_string(nlskb, NFNLA_CHAIN_TABLE, tname) ||
+ nla_put_string(nlskb, NFNLA_CHAIN_NAME, name) ||
+ nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, family)) {
+ nla_nest_cancel(nlskb, nest);
+ return -EMSGSIZE;
+ }
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ struct nft_chain *chain)
+{
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!chain))
+ return 0;
+
+ if (!nft_is_active(net, chain))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nfnl_hook_put_nft_info_desc(nlskb, chain->table->name,
+ chain->name, chain->table->family);
+ if (ret) {
+ nla_nest_cancel(nlskb, nest);
+ return ret;
+ }
+
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ struct nf_flowtable *nf_ft)
+{
+ struct nft_flowtable *ft =
+ container_of(nf_ft, struct nft_flowtable, data);
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!nf_ft))
+ return 0;
+
+ if (!nft_is_active(net, ft))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name,
+ ft->name, ft->table->family);
+ if (ret) {
+ nla_nest_cancel(nlskb, nest);
+ return ret;
+ }
+
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
static int nfnl_hook_dump_one(struct sk_buff *nlskb,
const struct nfnl_dump_hook_data *ctx,
const struct nf_hook_ops *ops,
@@ -171,7 +248,23 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb,
if (ret)
goto nla_put_failure;
- ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops);
+ switch (ops->hook_ops_type) {
+ case NF_HOOK_OP_NF_TABLES:
+ ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_BPF:
+ ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_NFT_FT:
+ ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_UNDEFINED:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
if (ret)
goto nla_put_failure;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index d97eb280cb2e..bfcb9cd335bf 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -103,9 +103,9 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num)
}
static struct nfulnl_instance *
-__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
+__instance_lookup(const struct nfnl_log_net *log, u16 group_num)
{
- struct hlist_head *head;
+ const struct hlist_head *head;
struct nfulnl_instance *inst;
head = &log->instance_table[instance_hashfn(group_num)];
@@ -123,15 +123,25 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
+instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num)
{
struct nfulnl_instance *inst;
- rcu_read_lock_bh();
inst = __instance_lookup(log, group_num);
if (inst && !refcount_inc_not_zero(&inst->use))
inst = NULL;
- rcu_read_unlock_bh();
+
+ return inst;
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(const struct nfnl_log_net *log, u16 group_num)
+{
+ struct nfulnl_instance *inst;
+
+ rcu_read_lock();
+ inst = instance_lookup_get_rcu(log, group_num);
+ rcu_read_unlock();
return inst;
}
@@ -371,7 +381,7 @@ static void
__nfulnl_flush(struct nfulnl_instance *inst)
{
/* timer holds a reference */
- if (del_timer(&inst->timer))
+ if (timer_delete(&inst->timer))
instance_put(inst);
if (inst->skb)
__nfulnl_send(inst);
@@ -380,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst)
static void
nfulnl_timer(struct timer_list *t)
{
- struct nfulnl_instance *inst = from_timer(inst, t, timer);
+ struct nfulnl_instance *inst = timer_container_of(inst, t, timer);
spin_lock_bh(&inst->lock);
if (inst->skb)
@@ -460,7 +470,6 @@ __build_packet_message(struct nfnl_log_net *log,
sk_buff_data_t old_tail = inst->skb->tail;
struct sock *sk;
const unsigned char *hwhdrp;
- ktime_t tstamp;
nlh = nfnl_msg_put(inst->skb, 0, 0,
nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
@@ -499,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
goto nla_put_failure;
} else {
- struct net_device *physindev;
+ int physinif;
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
@@ -507,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(indev->ifindex)))
goto nla_put_failure;
- physindev = nf_bridge_get_physindev(skb);
- if (physindev &&
+ physinif = nf_bridge_get_physinif(skb);
+ if (physinif &&
nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- htonl(physindev->ifindex)))
+ htonl(physinif)))
goto nla_put_failure;
}
#endif
@@ -589,10 +598,9 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- tstamp = skb_tstamp_cond(skb, false);
- if (hooknum <= NF_INET_FORWARD && tstamp) {
+ if (hooknum <= NF_INET_FORWARD) {
+ struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true));
struct nfulnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -690,15 +698,15 @@ nfulnl_log_packet(struct net *net,
unsigned int plen = 0;
struct nfnl_log_net *log = nfnl_log_pernet(net);
const struct nfnl_ct_hook *nfnl_ct = NULL;
+ enum ip_conntrack_info ctinfo = 0;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info ctinfo;
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(log, li->u.ulog.group);
+ inst = instance_lookup_get_rcu(log, li->u.ulog.group);
if (!inst)
return;
@@ -1030,7 +1038,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st)
struct hlist_head *head = &log->instance_table[st->bucket];
if (!hlist_empty(head))
- return rcu_dereference_bh(hlist_first_rcu(head));
+ return rcu_dereference(hlist_first_rcu(head));
}
return NULL;
}
@@ -1038,7 +1046,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st)
static struct hlist_node *get_next(struct net *net, struct iter_state *st,
struct hlist_node *h)
{
- h = rcu_dereference_bh(hlist_next_rcu(h));
+ h = rcu_dereference(hlist_next_rcu(h));
while (!h) {
struct nfnl_log_net *log;
struct hlist_head *head;
@@ -1048,7 +1056,7 @@ static struct hlist_node *get_next(struct net *net, struct iter_state *st,
log = nfnl_log_pernet(net);
head = &log->instance_table[st->bucket];
- h = rcu_dereference_bh(hlist_first_rcu(head));
+ h = rcu_dereference(hlist_first_rcu(head));
}
return h;
}
@@ -1066,9 +1074,9 @@ static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
}
static void *seq_start(struct seq_file *s, loff_t *pos)
- __acquires(rcu_bh)
+ __acquires(rcu)
{
- rcu_read_lock_bh();
+ rcu_read_lock();
return get_idx(seq_file_net(s), s->private, *pos);
}
@@ -1079,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void seq_stop(struct seq_file *s, void *v)
- __releases(rcu_bh)
+ __releases(rcu)
{
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
static int seq_show(struct seq_file *s, void *v)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index ee6840bd5933..c0fc431991e8 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -315,6 +315,14 @@ static int nfnl_osf_add_callback(struct sk_buff *skb,
f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+ if (f->opt_num > ARRAY_SIZE(f->opt))
+ return -EINVAL;
+
+ if (!memchr(f->genre, 0, MAXGENRELEN) ||
+ !memchr(f->subtype, 0, MAXGENRELEN) ||
+ !memchr(f->version, 0, MAXGENRELEN))
+ return -EINVAL;
+
kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
if (!kf)
return -ENOMEM;
@@ -439,3 +447,5 @@ module_init(nfnl_osf_init);
module_exit(nfnl_osf_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Passive OS fingerprint matching");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 87a9009d5234..8b7b39d8a109 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -29,6 +29,8 @@
#include <linux/netfilter/nfnetlink_queue.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
+#include <linux/cgroup-defs.h>
+#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
@@ -167,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head)
struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
rcu);
+ rcu_read_lock();
nfqnl_flush(inst, NULL, 0);
+ rcu_read_unlock();
kfree(inst);
module_put(THIS_MODULE);
}
@@ -223,22 +227,174 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
return entry;
}
+/*
+ * Run the hook functions in @hooks on @skb, starting at position *@index.
+ *
+ * Hooks are called in order for as long as they return NF_ACCEPT. A hook
+ * that returns NF_REPEAT is called again at the same position. Any other
+ * verdict stops the walk and is returned to the caller.
+ *
+ * On return *@index holds the position of the hook that produced the
+ * returned verdict, or the position after the last hook visited when
+ * every hook accepted the packet (in which case NF_ACCEPT is returned).
+ */
+static unsigned int nf_iterate(struct sk_buff *skb,
+ struct nf_hook_state *state,
+ const struct nf_hook_entries *hooks,
+ unsigned int *index)
+{
+ const struct nf_hook_entry *hook;
+ unsigned int verdict, i = *index;
+
+ while (i < hooks->num_hook_entries) {
+ hook = &hooks->hooks[i];
+repeat:
+ verdict = nf_hook_entry_hookfn(hook, skb, state);
+ if (verdict != NF_ACCEPT) {
+ /* record where the walk stopped before acting on the verdict */
+ *index = i;
+ if (verdict != NF_REPEAT)
+ return verdict;
+ goto repeat;
+ }
+ i++;
+ }
+
+ *index = i;
+ return NF_ACCEPT;
+}
+
+/*
+ * Look up the hook entry array registered in @net for protocol family @pf
+ * at hook number @hooknum.
+ *
+ * Only NFPROTO_IPV4, NFPROTO_IPV6 and, when CONFIG_NETFILTER_FAMILY_BRIDGE
+ * is set, NFPROTO_BRIDGE are handled; any other family triggers a one-time
+ * warning and yields NULL.
+ *
+ * The pointer is fetched with rcu_dereference(), so this presumably has to
+ * run inside an RCU read-side critical section -- confirm against callers.
+ */
+static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
+{
+ switch (pf) {
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ case NFPROTO_BRIDGE:
+ return rcu_dereference(net->nf.hooks_bridge[hooknum]);
+#endif
+ case NFPROTO_IPV4:
+ return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
+ case NFPROTO_IPV6:
+ return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /* not reached: every switch arm above returns */
+ return NULL;
+}
+
+/*
+ * IPv4 part of nf_reroute().
+ *
+ * For an entry whose hook is NF_INET_LOCAL_OUT, compare the packet's
+ * current tos, mark, destination and source address with the values kept
+ * in the entry's ip_rt_info (presumably captured when the packet was
+ * queued -- confirm against the queueing side). If any of them differs,
+ * the result of ip_route_me_harder() is returned.
+ *
+ * Returns 0 in every other case, including when CONFIG_INET is disabled.
+ */
+static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
+{
+#ifdef CONFIG_INET
+ const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ /* reroute only if one of the compared fields changed */
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(entry->state.net, entry->state.sk,
+ skb, RTN_UNSPEC);
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Dispatch the reroute step for @skb according to the protocol family
+ * stored in @entry.
+ *
+ * AF_INET is handled by nf_ip_reroute(). AF_INET6 is handled by the
+ * ->reroute callback of nf_ipv6_ops when that pointer is set. For any
+ * other family, or when no IPv6 ops are registered, 0 is returned.
+ *
+ * Returns the value produced by the family handler.
+ */
+static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+ const struct nf_ipv6_ops *v6ops;
+ int ret = 0;
+
+ switch (entry->state.pf) {
+ case AF_INET:
+ ret = nf_ip_reroute(skb, entry);
+ break;
+ case AF_INET6:
+ v6ops = rcu_dereference(nf_ipv6_ops);
+ if (v6ops)
+ ret = v6ops->reroute(skb, entry);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Resume hook traversal for a previously queued packet, applying @verdict.
+ *
+ * The caller must hold the rcu read-side lock.
+ *
+ * @entry is released with nf_queue_entry_free() on every path through this
+ * function. The skb is either freed here, passed to the entry's okfn,
+ * handed to nf_queue() again, or left alone on NF_STOLEN.
+ */
+static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+ const struct nf_hook_entry *hook_entry;
+ const struct nf_hook_entries *hooks;
+ struct sk_buff *skb = entry->skb;
+ const struct net *net;
+ unsigned int i;
+ int err;
+ u8 pf;
+
+ net = entry->state.net;
+ pf = entry->state.pf;
+
+ hooks = nf_hook_entries_head(net, pf, entry->state.hook);
+
+ i = entry->hook_index;
+ /* no hook array, or the saved index is out of range: drop the packet */
+ if (!hooks || i >= hooks->num_hook_entries) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
+ nf_queue_entry_free(entry);
+ return;
+ }
+
+ hook_entry = &hooks->hooks[i];
+
+ /* Continue traversal iff userspace said ok... */
+ /* NF_REPEAT: call the hook at the saved index again and use its verdict */
+ if (verdict == NF_REPEAT)
+ verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
+
+ /* an accepted packet is dropped if the reroute step fails */
+ if (verdict == NF_ACCEPT) {
+ if (nf_reroute(skb, entry) < 0)
+ verdict = NF_DROP;
+ }
+
+ if (verdict == NF_ACCEPT) {
+next_hook:
+ /* continue with the hook after the one at index i */
+ ++i;
+ verdict = nf_iterate(skb, &entry->state, hooks, &i);
+ }
+
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_STOP:
+ /* okfn is invoked with bottom halves disabled */
+ local_bh_disable();
+ entry->state.okfn(entry->state.net, entry->state.sk, skb);
+ local_bh_enable();
+ break;
+ case NF_QUEUE:
+ err = nf_queue(skb, &entry->state, i, verdict);
+ /* a return value of 1 from nf_queue() resumes traversal at the next hook */
+ if (err == 1)
+ goto next_hook;
+ break;
+ case NF_STOLEN:
+ break;
+ default:
+ kfree_skb(skb);
+ }
+
+ nf_queue_entry_free(entry);
+}
+
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
const struct nf_ct_hook *ct_hook;
- int err;
if (verdict == NF_ACCEPT ||
verdict == NF_REPEAT ||
verdict == NF_STOP) {
+ unsigned int ct_verdict = verdict;
+
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
- if (ct_hook) {
- err = ct_hook->update(entry->state.net, entry->skb);
- if (err < 0)
- verdict = NF_DROP;
- }
+ if (ct_hook)
+ ct_verdict = ct_hook->update(entry->state.net, entry->skb);
rcu_read_unlock();
+
+ switch (ct_verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ /* follow userspace verdict, could be REPEAT */
+ break;
+ case NF_STOLEN:
+ nf_queue_entry_free(entry);
+ return;
+ default:
+ verdict = ct_verdict & NF_VERDICT_MASK;
+ break;
+ }
}
nf_reinject(entry, verdict);
}
@@ -301,18 +457,31 @@ nla_put_failure:
return -1;
}
-static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
+/*
+ * Append the NFQA_CGROUP_CLASSID attribute for socket @sk to message @skb.
+ *
+ * The attribute is only emitted when CONFIG_CGROUP_NET_CLASSID is enabled,
+ * @sk is a non-NULL full socket and its classid is non-zero. The value is
+ * written as a big-endian 32-bit integer.
+ *
+ * Returns -1 if adding the attribute fails, 0 otherwise (including when
+ * nothing was added).
+ */
+static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
 {
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+ if (sk && sk_fullsock(sk)) {
+ u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+
+ if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
+ return -1;
+ }
+#endif
+ return 0;
+}
+
+static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
+{
+ int seclen = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
+
if (!skb || !sk_fullsock(skb->sk))
return 0;
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->secmark)
- security_secid_to_secctx(skb->secmark, secdata, &seclen);
-
+ seclen = security_secid_to_secctx(skb->secmark, ctx);
read_unlock_bh(&skb->sk->sk_callback_lock);
#endif
return seclen;
@@ -371,6 +540,14 @@ nla_put_failure:
return -1;
}
+/*
+ * Run the checksum helper for @entskb: skb_crc32c_csum_help() when
+ * skb_csum_is_sctp() reports an SCTP checksum, skb_checksum_help()
+ * otherwise. Returns whatever the chosen helper returns.
+ */
+static int nf_queue_checksum_help(struct sk_buff *entskb)
+{
+ if (skb_csum_is_sctp(entskb))
+ return skb_crc32c_csum_help(entskb);
+
+ return skb_checksum_help(entskb);
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
@@ -390,8 +567,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
enum ip_conntrack_info ctinfo = 0;
const struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
- char *secdata = NULL;
- u32 seclen = 0;
+ struct lsm_context ctx = { NULL, 0, 0 };
+ int seclen = 0;
ktime_t tstamp;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
@@ -406,6 +583,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* priority */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+ + nla_total_size(sizeof(u_int32_t)) /* classid */
+#endif
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
tstamp = skb_tstamp_cond(entskb, false);
@@ -430,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
case NFQNL_COPY_PACKET:
if (!(queue->flags & NFQA_CFG_F_GSO) &&
entskb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(entskb))
+ nf_queue_checksum_help(entskb))
return NULL;
data_len = READ_ONCE(queue->copy_range);
@@ -462,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
- seclen = nfqnl_get_sk_secctx(entskb, &secdata);
+ seclen = nfqnl_get_sk_secctx(entskb, &ctx);
+ if (seclen < 0)
+ return NULL;
if (seclen)
size += nla_total_size(seclen);
}
@@ -599,7 +781,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
goto nla_put_failure;
- if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
+ if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
+ goto nla_put_failure;
+
+ if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context))
goto nla_put_failure;
if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
@@ -627,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
nlh->nlmsg_len = skb->len;
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return skb;
nla_put_failure:
@@ -636,8 +821,8 @@ nla_put_failure:
kfree_skb(skb);
net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return NULL;
}
@@ -645,10 +830,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
- const struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ unsigned long status;
+ unsigned int use;
- if (ct && ((ct->status & flags) == IPS_DYING))
+ if (!ct)
+ return false;
+
+ status = READ_ONCE(ct->status);
+ if ((status & flags) == IPS_DYING)
return true;
+
+ if (status & IPS_CONFIRMED)
+ return false;
+
+ /* in some cases skb_clone() can occur after initial conntrack
+ * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+ * unconfirmed entries.
+ *
+ * This happens for br_netfilter and with ip multicast routing.
+ * This can't be solved with serialization here because one clone could
+ * have been queued for local delivery.
+ */
+ use = refcount_read(&ct->ct_general.use);
+ if (likely(use == 1))
+ return false;
+
+ /* Can't decrement further? Exclusive ownership. */
+ if (!refcount_dec_not_one(&ct->ct_general.use))
+ return false;
+
+ skb_set_nfct(entry->skb, 0);
+ /* No nf_ct_put(): we already decremented .use and it cannot
+ * drop down to 0.
+ */
+ return true;
#endif
return false;
}
@@ -808,7 +1024,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
break;
}
- if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
+ if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
return __nfqnl_enqueue_packet(net, queue, entry);
nf_bridge_adjust_skb_data(skb);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 84eae7cabc67..d550910aabec 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -17,6 +17,7 @@
struct nft_bitwise {
u8 sreg;
+ u8 sreg2;
u8 dreg;
enum nft_bitwise_ops op:8;
u8 len;
@@ -25,8 +26,8 @@ struct nft_bitwise {
struct nft_data data;
};
-static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
- const struct nft_bitwise *priv)
+static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
{
unsigned int i;
@@ -60,38 +61,82 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
}
}
+/*
+ * Store the bitwise AND of @src and @src2 in @dst, one 32-bit word at a
+ * time. The word count is priv->len bytes rounded up to whole u32 words.
+ */
+static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] & src2[i];
+}
+
+/*
+ * Store the bitwise OR of @src and @src2 in @dst, one 32-bit word at a
+ * time. The word count is priv->len bytes rounded up to whole u32 words.
+ */
+static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] | src2[i];
+}
+
+/*
+ * Store the bitwise XOR of @src and @src2 in @dst, one 32-bit word at a
+ * time. The word count is priv->len bytes rounded up to whole u32 words.
+ */
+static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] ^ src2[i];
+}
+
void nft_bitwise_eval(const struct nft_expr *expr,
struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
- const u32 *src = &regs->data[priv->sreg];
+ const u32 *src = &regs->data[priv->sreg], *src2;
u32 *dst = &regs->data[priv->dreg];
- switch (priv->op) {
- case NFT_BITWISE_BOOL:
- nft_bitwise_eval_bool(dst, src, priv);
- break;
- case NFT_BITWISE_LSHIFT:
+ if (priv->op == NFT_BITWISE_MASK_XOR) {
+ nft_bitwise_eval_mask_xor(dst, src, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_LSHIFT) {
nft_bitwise_eval_lshift(dst, src, priv);
- break;
- case NFT_BITWISE_RSHIFT:
+ return;
+ }
+ if (priv->op == NFT_BITWISE_RSHIFT) {
nft_bitwise_eval_rshift(dst, src, priv);
- break;
+ return;
+ }
+
+ src2 = priv->sreg2 ? &regs->data[priv->sreg2] : priv->data.data;
+
+ if (priv->op == NFT_BITWISE_AND) {
+ nft_bitwise_eval_and(dst, src, src2, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_OR) {
+ nft_bitwise_eval_or(dst, src, src2, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_XOR) {
+ nft_bitwise_eval_xor(dst, src, src2, priv);
+ return;
}
}
static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
[NFTA_BITWISE_SREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_SREG2] = { .type = NLA_U32 },
[NFTA_BITWISE_DREG] = { .type = NLA_U32 },
[NFTA_BITWISE_LEN] = { .type = NLA_U32 },
[NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
[NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
- [NFTA_BITWISE_OP] = { .type = NLA_U32 },
+ [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_BITWISE_DATA] = { .type = NLA_NESTED },
};
-static int nft_bitwise_init_bool(struct nft_bitwise *priv,
- const struct nlattr *const tb[])
+static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
{
struct nft_data_desc mask = {
.type = NFT_DATA_VALUE,
@@ -105,7 +150,8 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv,
};
int err;
- if (tb[NFTA_BITWISE_DATA])
+ if (tb[NFTA_BITWISE_DATA] ||
+ tb[NFTA_BITWISE_SREG2])
return -EINVAL;
if (!tb[NFTA_BITWISE_MASK] ||
@@ -139,7 +185,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
int err;
if (tb[NFTA_BITWISE_MASK] ||
- tb[NFTA_BITWISE_XOR])
+ tb[NFTA_BITWISE_XOR] ||
+ tb[NFTA_BITWISE_SREG2])
return -EINVAL;
if (!tb[NFTA_BITWISE_DATA])
@@ -157,6 +204,41 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
return 0;
}
+/*
+ * Parse the second operand of an AND/OR/XOR bitwise expression.
+ *
+ * NFTA_BITWISE_MASK and NFTA_BITWISE_XOR are not accepted here. Exactly one
+ * of NFTA_BITWISE_DATA (immediate operand, stored in priv->data with length
+ * priv->len) or NFTA_BITWISE_SREG2 (register operand of priv->len bytes,
+ * stored in priv->sreg2) must be supplied.
+ *
+ * Returns 0 on success, -EINVAL for an invalid attribute combination, or
+ * the negative error returned by the attribute parser.
+ */
+static int nft_bitwise_init_bool(const struct nft_ctx *ctx,
+ struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ int err;
+
+ if (tb[NFTA_BITWISE_MASK] ||
+ tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ /* the operand is either immediate data or a register, never both or neither */
+ if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) ||
+ (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2]))
+ return -EINVAL;
+
+ if (tb[NFTA_BITWISE_DATA]) {
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = priv->len,
+ };
+
+ err = nft_data_init(NULL, &priv->data, &desc,
+ tb[NFTA_BITWISE_DATA]);
+ if (err < 0)
+ return err;
+ } else {
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2],
+ &priv->sreg2, priv->len);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
static int nft_bitwise_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -171,7 +253,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
priv->len = len;
- err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg,
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg,
priv->len);
if (err < 0)
return err;
@@ -185,32 +267,40 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
if (tb[NFTA_BITWISE_OP]) {
priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
switch (priv->op) {
- case NFT_BITWISE_BOOL:
+ case NFT_BITWISE_MASK_XOR:
case NFT_BITWISE_LSHIFT:
case NFT_BITWISE_RSHIFT:
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
break;
default:
return -EOPNOTSUPP;
}
} else {
- priv->op = NFT_BITWISE_BOOL;
+ priv->op = NFT_BITWISE_MASK_XOR;
}
switch(priv->op) {
- case NFT_BITWISE_BOOL:
- err = nft_bitwise_init_bool(priv, tb);
+ case NFT_BITWISE_MASK_XOR:
+ err = nft_bitwise_init_mask_xor(priv, tb);
break;
case NFT_BITWISE_LSHIFT:
case NFT_BITWISE_RSHIFT:
err = nft_bitwise_init_shift(priv, tb);
break;
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ err = nft_bitwise_init_bool(ctx, priv, tb);
+ break;
}
return err;
}
-static int nft_bitwise_dump_bool(struct sk_buff *skb,
- const struct nft_bitwise *priv)
+static int nft_bitwise_dump_mask_xor(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
{
if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
NFT_DATA_VALUE, priv->len) < 0)
@@ -232,6 +322,21 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb,
return 0;
}
+/*
+ * Dump the second operand of an AND/OR/XOR bitwise expression to @skb.
+ *
+ * A non-zero priv->sreg2 means the operand is a register and is dumped as
+ * NFTA_BITWISE_SREG2; otherwise the immediate operand in priv->data is
+ * dumped as NFTA_BITWISE_DATA.
+ *
+ * Returns 0 on success, -1 if the attribute could not be added.
+ */
+static int nft_bitwise_dump_bool(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (priv->sreg2) {
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2))
+ return -1;
+ } else {
+ /* The immediate operand is parsed with length priv->len by
+ * nft_bitwise_init_bool(), so dump that many bytes; dumping only
+ * sizeof(u32) would truncate operands wider than one register.
+ */
+ if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
static int nft_bitwise_dump(struct sk_buff *skb,
const struct nft_expr *expr, bool reset)
{
@@ -248,13 +353,18 @@ static int nft_bitwise_dump(struct sk_buff *skb,
return -1;
switch (priv->op) {
- case NFT_BITWISE_BOOL:
- err = nft_bitwise_dump_bool(skb, priv);
+ case NFT_BITWISE_MASK_XOR:
+ err = nft_bitwise_dump_mask_xor(skb, priv);
break;
case NFT_BITWISE_LSHIFT:
case NFT_BITWISE_RSHIFT:
err = nft_bitwise_dump_shift(skb, priv);
break;
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ err = nft_bitwise_dump_bool(skb, priv);
+ break;
}
return err;
@@ -269,7 +379,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
const struct nft_bitwise *priv = nft_expr_priv(expr);
struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
- if (priv->op != NFT_BITWISE_BOOL)
+ if (priv->op != NFT_BITWISE_MASK_XOR)
return -EOPNOTSUPP;
if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) ||
@@ -299,6 +409,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track,
track->regs[priv->dreg].bitwise &&
track->regs[priv->dreg].bitwise->ops == expr->ops &&
priv->sreg == bitwise->sreg &&
+ priv->sreg2 == bitwise->sreg2 &&
priv->dreg == bitwise->dreg &&
priv->op == bitwise->op &&
priv->len == bitwise->len &&
@@ -323,7 +434,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track,
dreg = priv->dreg;
regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE);
for (i = 0; i < regcount; i++, dreg++)
- track->regs[priv->dreg].bitwise = expr;
+ track->regs[dreg].bitwise = expr;
return false;
}
@@ -365,7 +476,7 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
int err;
- err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg,
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg,
sizeof(u32));
if (err < 0)
return err;
@@ -375,7 +486,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- if (tb[NFTA_BITWISE_DATA])
+ if (tb[NFTA_BITWISE_DATA] ||
+ tb[NFTA_BITWISE_SREG2])
return -EINVAL;
if (!tb[NFTA_BITWISE_MASK] ||
@@ -406,7 +518,7 @@ nft_bitwise_fast_dump(struct sk_buff *skb,
return -1;
if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32))))
return -1;
- if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL)))
+ if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR)))
return -1;
data.data[0] = priv->mask;
@@ -501,7 +613,7 @@ nft_bitwise_select_ops(const struct nft_ctx *ctx,
return &nft_bitwise_ops;
if (tb[NFTA_BITWISE_OP] &&
- ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL)
+ ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR)
return &nft_bitwise_ops;
return &nft_bitwise_fast_ops;
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index b66647a5a171..af9206a3afd1 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -5,7 +5,7 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -30,21 +30,22 @@ void nft_byteorder_eval(const struct nft_expr *expr,
const struct nft_byteorder *priv = nft_expr_priv(expr);
u32 *src = &regs->data[priv->sreg];
u32 *dst = &regs->data[priv->dreg];
- union { u32 u32; u16 u16; } *s, *d;
+ u16 *s16, *d16;
unsigned int i;
- s = (void *)src;
- d = (void *)dst;
+ s16 = (void *)src;
+ d16 = (void *)dst;
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = nft_reg_load64(&src[i]);
- nft_reg_store64(&dst[i],
+ nft_reg_store64(&dst64[i],
be64_to_cpu((__force __be64)src64));
}
break;
@@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr,
for (i = 0; i < priv->len / 8; i++) {
src64 = (__force __u64)
cpu_to_be64(nft_reg_load64(&src[i]));
- nft_reg_store64(&dst[i], src64);
+ nft_reg_store64(&dst64[i], src64);
}
break;
}
@@ -62,11 +63,11 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 4; i++)
- d[i].u32 = ntohl((__force __be32)s[i].u32);
+ dst[i] = ntohl((__force __be32)src[i]);
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 4; i++)
- d[i].u32 = (__force __u32)htonl(s[i].u32);
+ dst[i] = (__force __u32)htonl(src[i]);
break;
}
break;
@@ -74,11 +75,11 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 2; i++)
- d[i].u16 = ntohs((__force __be16)s[i].u16);
+ d16[i] = ntohs((__force __be16)s16[i]);
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 2; i++)
- d[i].u16 = (__force __u16)htons(s[i].u16);
+ d16[i] = (__force __u16)htons(s16[i]);
break;
}
break;
@@ -88,9 +89,9 @@ void nft_byteorder_eval(const struct nft_expr *expr,
static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
[NFTA_BYTEORDER_SREG] = { .type = NLA_U32 },
[NFTA_BYTEORDER_DREG] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_OP] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_byteorder_init(const struct nft_ctx *ctx,
@@ -138,7 +139,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
priv->len = len;
- err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg,
+ err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg,
priv->len);
if (err < 0)
return err;
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index c3563f0be269..b16185e9a6dd 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -318,33 +318,85 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
},
};
-static void nft_netdev_event(unsigned long event, struct net_device *dev,
- struct nft_ctx *ctx)
+static int nft_netdev_event(unsigned long event, struct net_device *dev,
+ struct nft_base_chain *basechain, bool changename)
{
- struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
- struct nft_hook *hook, *found = NULL;
- int n = 0;
-
- if (event != NETDEV_UNREGISTER)
- return;
+ struct nft_table *table = basechain->chain.table;
+ struct nf_hook_ops *ops;
+ struct nft_hook *hook;
+ bool match;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev == dev)
- found = hook;
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
- n++;
- }
- if (!found)
- return;
-
- if (n > 1) {
- nf_unregister_net_hook(ctx->net, &found->ops);
- list_del_rcu(&found->list);
- kfree_rcu(found, rcu);
- return;
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT))
+ nf_unregister_net_hook(dev_net(dev), ops);
+
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kmemdup(&basechain->ops,
+ sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->dev = dev;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nf_register_net_hook(dev_net(dev), ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
+ break;
}
+ return 0;
+}
+
+static int __nf_tables_netdev_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
+{
+ struct nft_base_chain *basechain;
+ struct nftables_pernet *nft_net;
+ struct nft_chain *chain;
+ struct nft_table *table;
+
+ nft_net = nft_pernet(dev_net(dev));
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (table->family != NFPROTO_NETDEV &&
+ table->family != NFPROTO_INET)
+ continue;
- __nft_release_basechain(ctx);
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_base_chain(chain))
+ continue;
+
+ basechain = nft_base_chain(chain);
+ if (table->family == NFPROTO_INET &&
+ basechain->ops.hooknum != NF_INET_INGRESS)
+ continue;
+
+ if (nft_netdev_event(event, dev, basechain, changename))
+ return 1;
+ }
+ }
+ return 0;
}
static int nf_tables_netdev_event(struct notifier_block *this,
@@ -352,38 +404,28 @@ static int nf_tables_netdev_event(struct notifier_block *this,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nftables_pernet *nft_net;
- struct nft_table *table;
- struct nft_chain *chain, *nr;
- struct nft_ctx ctx = {
- .net = dev_net(dev),
- };
+ int ret = NOTIFY_DONE;
- if (event != NETDEV_UNREGISTER &&
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
event != NETDEV_CHANGENAME)
return NOTIFY_DONE;
- if (!check_net(ctx.net))
- return NOTIFY_DONE;
-
- nft_net = nft_pernet(ctx.net);
+ nft_net = nft_pernet(dev_net(dev));
mutex_lock(&nft_net->commit_mutex);
- list_for_each_entry(table, &nft_net->tables, list) {
- if (table->family != NFPROTO_NETDEV)
- continue;
- ctx.family = table->family;
- ctx.table = table;
- list_for_each_entry_safe(chain, nr, &table->chains, list) {
- if (!nft_is_base_chain(chain))
- continue;
-
- ctx.chain = chain;
- nft_netdev_event(event, dev, &ctx);
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
}
+ __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_netdev_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
}
+out_unlock:
mutex_unlock(&nft_net->commit_mutex);
-
- return NOTIFY_DONE;
+ return ret;
}
static struct notifier_block nf_tables_netdev_notifier = {
diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c
index 98e4946100c5..40e230d8b712 100644
--- a/net/netfilter/nft_chain_nat.c
+++ b/net/netfilter/nft_chain_nat.c
@@ -137,6 +137,7 @@ module_init(nft_chain_nat_init);
module_exit(nft_chain_nat_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("nftables network address translation support");
#ifdef CONFIG_NF_TABLES_IPV4
MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
#endif
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 6eb21a4f5698..2605f43737bc 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -83,7 +83,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (err < 0)
return err;
- err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
@@ -162,7 +162,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
memcpy(key + reg->offset, data, reg->len);
memcpy(mask + reg->offset, datamask, reg->len);
- flow->match.dissector.used_keys |= BIT(reg->key);
+ flow->match.dissector.used_keys |= BIT_ULL(reg->key);
flow->match.dissector.offset[reg->key] = reg->base_offset;
if (reg->key == FLOW_DISSECTOR_KEY_META &&
@@ -222,7 +222,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
@@ -323,7 +323,7 @@ static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 5284cd2ad532..72711d62fddf 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -135,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
[NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
- [NFTA_TARGET_REV] = { .type = NLA_U32 },
+ [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TARGET_INFO] = { .type = NLA_BINARY },
};
@@ -200,6 +200,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1]
static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
{
struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+ u32 l4proto;
u32 flags;
int err;
@@ -212,16 +213,22 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
return -EINVAL;
flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
- if (flags & ~NFT_RULE_COMPAT_F_MASK)
+ if (flags & NFT_RULE_COMPAT_F_UNUSED ||
+ flags & ~NFT_RULE_COMPAT_F_MASK)
return -EINVAL;
if (flags & NFT_RULE_COMPAT_F_INV)
*inv = true;
- *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ if (l4proto > U16_MAX)
+ return -EINVAL;
+
+ *proto = l4proto;
+
return 0;
}
-static void nft_compat_wait_for_destructors(void)
+static void nft_compat_wait_for_destructors(struct net *net)
{
/* xtables matches or targets can have side effects, e.g.
* creation/destruction of /proc files.
@@ -229,7 +236,7 @@ static void nft_compat_wait_for_destructors(void)
* work queue. If we have pending invocations we thus
* need to wait for those to finish.
*/
- nf_tables_trans_destroy_flush_work();
+ nf_tables_trans_destroy_flush_work(net);
}
static int
@@ -255,7 +262,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
- nft_compat_wait_for_destructors();
+ nft_compat_wait_for_destructors(ctx->net);
ret = xt_check_target(&par, size, proto, inv);
if (ret < 0) {
@@ -343,13 +350,28 @@ nla_put_failure:
}
static int nft_target_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct xt_target *target = expr->ops->data;
unsigned int hook_mask = 0;
int ret;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
+ ret = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+ if (ret)
+ return ret;
+
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
@@ -413,7 +435,7 @@ static void nft_match_eval(const struct nft_expr *expr,
static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
[NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
- [NFTA_MATCH_REV] = { .type = NLA_U32 },
+ [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_MATCH_INFO] = { .type = NLA_BINARY },
};
@@ -493,7 +515,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
- nft_compat_wait_for_destructors();
+ nft_compat_wait_for_destructors(ctx->net);
return xt_check_match(&par, size, proto, inv);
}
@@ -513,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
struct xt_match *m = expr->ops->data;
int ret;
- priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL);
+ priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT);
if (!priv->info)
return -ENOMEM;
@@ -588,13 +610,28 @@ static int nft_match_large_dump(struct sk_buff *skb,
}
static int nft_match_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct xt_match *match = expr->ops->data;
unsigned int hook_mask = 0;
int ret;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
+ ret = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+ if (ret)
+ return ret;
+
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
@@ -712,7 +749,7 @@ out_put:
static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
[NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING,
.len = NFT_COMPAT_NAME_MAX-1 },
- [NFTA_COMPAT_REV] = { .type = NLA_U32 },
+ [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_COMPAT_TYPE] = { .type = NLA_U32 },
};
@@ -771,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
goto err;
}
- ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL);
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err;
@@ -861,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
goto err;
}
- ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL);
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index de9d1980df69..657764774a2d 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
const struct nft_pktinfo *pkt,
const struct nft_set_ext *ext)
{
- const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
- const struct nf_conntrack_tuple *tuple_ptr;
- struct nf_conntrack_tuple tuple;
- enum ip_conntrack_info ctinfo;
- const struct nf_conn *ct;
unsigned int count;
+ int err;
- tuple_ptr = &tuple;
-
- ct = nf_ct_get(pkt->skb, &ctinfo);
- if (ct != NULL) {
- tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- zone = nf_ct_zone(ct);
- } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
- nft_pf(pkt), nft_net(pkt), &tuple)) {
- regs->verdict.code = NF_DROP;
- return;
- }
-
- if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) {
- regs->verdict.code = NF_DROP;
- return;
+ err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list);
+ if (err) {
+ if (err == -EEXIST) {
+ /* Call gc to update the list count if any connection has
+ * been closed already. This is useful for softlimit
+ * connections like limiting bandwidth based on a number
+ * of open connections.
+ */
+ nf_conncount_gc_list(nft_net(pkt), priv->list);
+ } else {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
}
- count = priv->list->count;
+ count = READ_ONCE(priv->list->count);
- if ((count > priv->limit) ^ priv->invert) {
+ if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -137,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
return nft_connlimit_do_init(ctx, tb, priv);
}
+static void nft_connlimit_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_connlimit *newpriv = nft_obj_data(newobj);
+ struct nft_connlimit *priv = nft_obj_data(obj);
+
+ WRITE_ONCE(priv->limit, newpriv->limit);
+ WRITE_ONCE(priv->invert, newpriv->invert);
+}
+
static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
struct nft_object *obj)
{
@@ -166,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = {
.init = nft_connlimit_obj_init,
.destroy = nft_connlimit_obj_destroy,
.dump = nft_connlimit_obj_dump,
+ .update = nft_connlimit_obj_update,
};
static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
@@ -210,12 +215,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx,
nft_connlimit_do_destroy(ctx, priv);
}
-static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp);
if (!priv_dst->list)
return -ENOMEM;
@@ -238,13 +243,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- bool ret;
-
- local_bh_disable();
- ret = nf_conncount_gc_list(net, priv->list);
- local_bh_enable();
- return ret;
+ return nf_conncount_gc_list(net, priv->list);
}
static struct nft_expr_type nft_connlimit_type;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index dccc68a5135a..cc7325329496 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -8,7 +8,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
@@ -17,6 +17,11 @@
#include <net/netfilter/nf_tables_offload.h>
struct nft_counter {
+ u64_stats_t bytes;
+ u64_stats_t packets;
+};
+
+struct nft_counter_tot {
s64 bytes;
s64 packets;
};
@@ -25,25 +30,24 @@ struct nft_counter_percpu_priv {
struct nft_counter __percpu *counter;
};
-static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
+static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync);
static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
-
- write_seqcount_begin(myseq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
- this_cpu->bytes += pkt->skb->len;
- this_cpu->packets++;
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->bytes, pkt->skb->len);
+ u64_stats_inc(&this_cpu->packets);
+ u64_stats_update_end(nft_sync);
- write_seqcount_end(myseq);
local_bh_enable();
}
@@ -66,17 +70,16 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu = raw_cpu_ptr(cpu_stats);
if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ u64_stats_set(&this_cpu->packets,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])));
}
if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ u64_stats_set(&this_cpu->bytes,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])));
}
- preempt_enable();
+
priv->counter = cpu_stats;
return 0;
}
@@ -104,35 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
}
static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- this_cpu->packets -= total->packets;
- this_cpu->bytes -= total->bytes;
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
+
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, -total->packets);
+ u64_stats_add(&this_cpu->bytes, -total->bytes);
+ u64_stats_update_end(nft_sync);
+
local_bh_enable();
}
static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
struct nft_counter *this_cpu;
- const seqcount_t *myseq;
u64 bytes, packets;
unsigned int seq;
int cpu;
memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- myseq = per_cpu_ptr(&nft_counter_seq, cpu);
+ struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu);
+
this_cpu = per_cpu_ptr(priv->counter, cpu);
do {
- seq = read_seqcount_begin(myseq);
- bytes = this_cpu->bytes;
- packets = this_cpu->packets;
- } while (read_seqcount_retry(myseq, seq));
+ seq = u64_stats_fetch_begin(nft_sync);
+ bytes = u64_stats_read(&this_cpu->bytes);
+ packets = u64_stats_read(&this_cpu->packets);
+ } while (u64_stats_fetch_retry(nft_sync, seq));
total->bytes += bytes;
total->packets += packets;
@@ -143,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb,
struct nft_counter_percpu_priv *priv,
bool reset)
{
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
@@ -226,25 +235,23 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
nft_counter_do_destroy(priv);
}
-static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
- cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp);
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- this_cpu->packets = total.packets;
- this_cpu->bytes = total.bytes;
- preempt_enable();
+ this_cpu = raw_cpu_ptr(cpu_stats);
+ u64_stats_set(&this_cpu->packets, total.packets);
+ u64_stats_set(&this_cpu->bytes, total.bytes);
priv_clone->counter = cpu_stats;
return 0;
@@ -262,18 +269,18 @@ static void nft_counter_offload_stats(struct nft_expr *expr,
const struct flow_stats *stats)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
- preempt_disable();
+ local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
- write_seqcount_begin(myseq);
- this_cpu->packets += stats->pkts;
- this_cpu->bytes += stats->bytes;
- write_seqcount_end(myseq);
- preempt_enable();
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, stats->pkts);
+ u64_stats_add(&this_cpu->bytes, stats->bytes);
+ u64_stats_update_end(nft_sync);
+ local_bh_enable();
}
void nft_counter_init_seqcount(void)
@@ -281,7 +288,7 @@ void nft_counter_init_seqcount(void)
int cpu;
for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
+ u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu));
}
struct nft_expr_type nft_counter_type;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index c68e2151defe..6f2ae7cad731 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -12,7 +12,7 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_tuple.h>
@@ -22,16 +22,7 @@
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
-
-struct nft_ct {
- enum nft_ct_keys key:8;
- enum ip_conntrack_dir dir:8;
- u8 len;
- union {
- u8 dreg;
- u8 sreg;
- };
-};
+#include <net/netfilter/nf_conntrack_seqadj.h>
struct nft_ct_helper_obj {
struct nf_conntrack_helper *helper4;
@@ -118,7 +109,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
helper = rcu_dereference(help->helper);
if (helper == NULL)
goto err;
- strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
+ strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
return;
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS: {
@@ -240,6 +231,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
enum ip_conntrack_info ctinfo;
u16 value = nft_reg_load16(&regs->data[priv->sreg]);
struct nf_conn *ct;
+ int oldcnt;
ct = nf_ct_get(skb, &ctinfo);
if (ct) /* already tracked */
@@ -260,10 +252,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(refcount_read(&ct->ct_general.use) == 1)) {
- refcount_inc(&ct->ct_general.use);
+ __refcount_inc(&ct->ct_general.use, &oldcnt);
+ if (likely(oldcnt == 1)) {
nf_ct_zone_add(ct, &zone);
} else {
+ refcount_dec(&ct->ct_general.use);
/* previous skb got queued to userspace, allocate temporary
* one until percpu template can be reused.
*/
@@ -272,6 +265,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
regs->verdict.code = NF_DROP;
return;
}
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
}
nf_ct_set(skb, ct, IP_CT_NEW);
@@ -342,7 +336,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_DREG] = { .type = NLA_U32 },
- [NFTA_CT_KEY] = { .type = NLA_U32 },
+ [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_CT_DIRECTION] = { .type = NLA_U8 },
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
@@ -378,6 +372,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
return false;
}
+ __set_bit(IPS_CONFIRMED_BIT, &tmp->status);
per_cpu(nft_ct_pcpu_template, cpu) = tmp;
}
@@ -385,6 +380,14 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
}
#endif
+static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ if (priv->key == NFT_CT_LABELS)
+ nf_connlabels_put(ctx->net);
+#endif
+}
+
static int nft_ct_get_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -419,6 +422,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DIRECTION] != NULL)
return -EINVAL;
len = NF_CT_LABELS_MAX_SIZE;
+
+ err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
+ if (err)
+ return err;
break;
#endif
case NFT_CT_HELPER:
@@ -484,6 +491,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
break;
#endif
case NFT_CT_ID:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+
len = sizeof(u32);
break;
default:
@@ -497,7 +507,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
case IP_CT_DIR_REPLY:
break;
default:
- return -EINVAL;
+ err = -EINVAL;
+ goto err;
}
}
@@ -505,11 +516,11 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL,
NFT_DATA_VALUE, len);
if (err < 0)
- return err;
+ goto err;
err = nf_ct_netns_get(ctx->net, ctx->family);
if (err < 0)
- return err;
+ goto err;
if (priv->key == NFT_CT_BYTES ||
priv->key == NFT_CT_PKTS ||
@@ -517,6 +528,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
nf_ct_set_acct(ctx->net, true);
return 0;
+err:
+ __nft_ct_get_destroy(ctx, priv);
+ return err;
}
static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
@@ -611,7 +625,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
}
priv->len = len;
- err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len);
if (err < 0)
goto err1;
@@ -629,6 +643,9 @@ err1:
static void nft_ct_get_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
+ struct nft_ct *priv = nft_expr_priv(expr);
+
+ __nft_ct_get_destroy(ctx, priv);
nf_ct_netns_put(ctx->net, ctx->family);
}
@@ -759,6 +776,18 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track,
return false;
}
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static const struct nft_expr_ops nft_ct_get_fast_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_get_fast_eval,
+ .init = nft_ct_get_init,
+ .destroy = nft_ct_get_destroy,
+ .dump = nft_ct_get_dump,
+ .reduce = nft_ct_set_reduce,
+};
+#endif
+
static const struct nft_expr_ops nft_ct_set_ops = {
.type = &nft_ct_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
@@ -791,8 +820,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG])
return ERR_PTR(-EINVAL);
- if (tb[NFTA_CT_DREG])
+ if (tb[NFTA_CT_DREG]) {
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+
+ switch (k) {
+ case NFT_CT_STATE:
+ case NFT_CT_DIRECTION:
+ case NFT_CT_STATUS:
+ case NFT_CT_MARK:
+ case NFT_CT_SECMARK:
+ return &nft_ct_get_fast_ops;
+ }
+#endif
return &nft_ct_get_ops;
+ }
if (tb[NFTA_CT_SREG]) {
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -909,7 +951,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj,
*/
values = nf_ct_timeout_data(timeout);
if (values)
- nf_ct_refresh(ct, pkt->skb, values[0]);
+ nf_ct_refresh(ct, values[0]);
}
static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
@@ -1151,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
if (help) {
rcu_assign_pointer(help->helper, to_assign);
set_bit(IPS_HELPER_BIT, &ct->status);
+
+ if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct))
+ if (!nfct_seqadj_ext_add(ct))
+ regs->verdict.code = NF_DROP;
}
}
@@ -1233,7 +1279,30 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_EXPECT_L3PROTO])
priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+ switch (priv->l3num) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET)
+ break;
+
+ return -EINVAL;
+ case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */
+ default:
+ return -EAFNOSUPPORT;
+ }
+
priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+ switch (priv->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_DCCP:
+ case IPPROTO_SCTP:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c
new file mode 100644
index 000000000000..e684c8a91848
--- /dev/null
+++ b/net/netfilter/nft_ct_fast.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#if IS_ENABLED(CONFIG_NFT_CT)
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack.h>
+
+void nft_ct_get_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int state;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+
+ switch (priv->key) {
+ case NFT_CT_STATE:
+ if (ct)
+ state = NF_CT_STATE_BIT(ctinfo);
+ else if (ctinfo == IP_CT_UNTRACKED)
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ else
+ state = NF_CT_STATE_INVALID_BIT;
+ *dest = state;
+ return;
+ default:
+ break;
+ }
+
+ if (!ct) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ switch (priv->key) {
+ case NFT_CT_DIRECTION:
+ nft_reg_store8(dest, CTINFO2DIR(ctinfo));
+ return;
+ case NFT_CT_STATUS:
+ *dest = ct->status;
+ return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ *dest = ct->mark;
+ return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ *dest = ct->secmark;
+ return;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ regs->verdict.code = NFT_BREAK;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval);
+#endif
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index e5739a59ebf1..0573f96ce079 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -40,7 +40,7 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_DEV] == NULL)
return -EINVAL;
- return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev,
+ return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev,
sizeof(int));
}
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 274579b1696e..7807d8129664 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv,
for (i = 0; i < priv->num_exprs; i++) {
expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
- if (nft_expr_clone(expr, priv->expr_array[i]) < 0)
+ if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0)
return -1;
elem_expr->size += priv->expr_array[i]->ops->size;
@@ -44,33 +44,34 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv,
return 0;
}
-static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
- struct nft_regs *regs)
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+ const struct nft_expr *expr,
+ struct nft_regs *regs)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set_ext *ext;
+ void *elem_priv;
u64 timeout;
- void *elem;
if (!atomic_add_unless(&set->nelems, 1, set->size))
return NULL;
- timeout = priv->timeout ? : set->timeout;
- elem = nft_set_elem_init(set, &priv->tmpl,
- &regs->data[priv->sreg_key], NULL,
- &regs->data[priv->sreg_data],
- timeout, 0, GFP_ATOMIC);
- if (IS_ERR(elem))
+ timeout = priv->timeout ? : READ_ONCE(set->timeout);
+ elem_priv = nft_set_elem_init(set, &priv->tmpl,
+ &regs->data[priv->sreg_key], NULL,
+ &regs->data[priv->sreg_data],
+ timeout, 0, GFP_ATOMIC);
+ if (IS_ERR(elem_priv))
goto err1;
- ext = nft_set_elem_ext(set, elem);
+ ext = nft_set_elem_ext(set, elem_priv);
if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
goto err2;
- return elem;
+ return elem_priv;
err2:
- nft_set_elem_destroy(set, elem, false);
+ nft_set_elem_destroy(set, elem_priv, false);
err1:
if (set->size)
atomic_dec(&set->nelems);
@@ -90,12 +91,13 @@ void nft_dynset_eval(const struct nft_expr *expr,
return;
}
- if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
- expr, regs, &ext)) {
+ ext = set->ops->update(set, &regs->data[priv->sreg_key], expr, regs);
+ if (ext) {
if (priv->op == NFT_DYNSET_OP_UPDATE &&
- nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- timeout = priv->timeout ? : set->timeout;
- *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
+ nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
+ READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) {
+ timeout = priv->timeout ? : READ_ONCE(set->timeout);
+ WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + timeout);
}
nft_set_elem_update_expr(ext, regs, pkt);
@@ -148,7 +150,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
- [NFTA_DYNSET_OP] = { .type = NLA_U32 },
+ [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
[NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
@@ -191,6 +193,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
+ if (set->flags & NFT_SET_OBJECT)
+ return -EOPNOTSUPP;
+
if (set->ops->update == NULL)
return -EOPNOTSUPP;
@@ -211,7 +216,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return err;
}
- err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key,
+ err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key,
set->klen);
if (err < 0)
return err;
@@ -222,7 +227,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (set->dtype == NFT_DATA_VERDICT)
return -EOPNOTSUPP;
- err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA],
+ err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA],
&priv->sreg_data, set->dlen);
if (err < 0)
return err;
@@ -276,10 +281,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->expr_array[i] = dynset_expr;
priv->num_exprs++;
- if (set->num_exprs &&
- dynset_expr->ops != set->exprs[i]->ops) {
- err = -EOPNOTSUPP;
- goto err_expr_free;
+ if (set->num_exprs) {
+ if (i >= set->num_exprs) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ if (dynset_expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
}
i++;
}
@@ -303,12 +313,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (priv->num_exprs)
nft_dynset_ext_add_expr(priv);
- if (set->flags & NFT_SET_TIMEOUT) {
- if (timeout || set->timeout) {
- nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT);
- nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION);
- }
- }
+ if (set->flags & NFT_SET_TIMEOUT &&
+ (timeout || READ_ONCE(set->timeout)))
+ nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT);
priv->timeout = timeout;
@@ -342,7 +349,7 @@ static void nft_dynset_activate(const struct nft_ctx *ctx,
{
struct nft_dynset *priv = nft_expr_priv(expr);
- priv->set->use++;
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_dynset_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a54a7f772cec..7eedf4e3ae9c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -5,11 +5,12 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/dccp.h>
#include <linux/sctp.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
@@ -34,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset)
return opt[offset + 1];
}
+static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len)
+{
+ if (len % NFT_REG32_SIZE)
+ dest[len / NFT_REG32_SIZE] = 0;
+
+ return skb_copy_bits(skb, offset, dest, len);
+}
+
static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -55,8 +64,7 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
- if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0)
goto err;
return;
err:
@@ -77,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct iphdr *iph, _iph;
- unsigned int start;
bool found = false;
__be32 info;
int optlen;
@@ -85,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (!iph)
return -EBADMSG;
- start = sizeof(struct iphdr);
optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
if (optlen <= 0)
@@ -95,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
/* Copy the options since __ip_options_compile() modifies
* the options.
*/
- if (skb_copy_bits(skb, start, opt->__data, optlen))
+ if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen))
return -EBADMSG;
opt->optlen = optlen;
@@ -110,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
found = target == IPOPT_SSRR ? opt->is_strictroute :
!opt->is_strictroute;
if (found)
- *offset = opt->srr + start;
+ *offset = opt->srr;
break;
case IPOPT_RR:
if (!opt->rr)
break;
- *offset = opt->rr + start;
+ *offset = opt->rr;
found = true;
break;
case IPOPT_RA:
if (!opt->router_alert)
break;
- *offset = opt->router_alert + start;
+ *offset = opt->router_alert;
found = true;
break;
default:
@@ -152,8 +158,7 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
- if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0)
goto err;
return;
err:
@@ -207,9 +212,10 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
offset = i + priv->offset;
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = 1;
+ nft_reg_store8(dest, 1);
} else {
- dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
memcpy(dest, opt + offset, priv->len);
}
@@ -237,7 +243,12 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (!tcph)
goto err;
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto err;
+
+ tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt));
opt = (u8 *)tcph;
+
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
union {
__be16 v16;
@@ -252,15 +263,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
goto err;
- if (skb_ensure_writable(pkt->skb,
- nft_thoff(pkt) + i + priv->len))
- goto err;
-
- tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
- &tcphdr_len);
- if (!tcph)
- goto err;
-
offset = i + priv->offset;
switch (priv->len) {
@@ -324,9 +326,9 @@ static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
goto drop;
- opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
- if (!opt)
- goto err;
+ tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt));
+ opt = (u8 *)tcph;
+
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
unsigned int j;
@@ -391,9 +393,8 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
offset + ntohs(sch->length) > pkt->skb->len)
break;
- dest[priv->len / NFT_REG32_SIZE] = 0;
- if (skb_copy_bits(pkt->skb, offset + priv->offset,
- dest, priv->len) < 0)
+ if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset,
+ dest, priv->len) < 0)
break;
return;
}
@@ -406,13 +407,91 @@ err:
regs->verdict.code = NFT_BREAK;
}
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static void nft_exthdr_dccp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int thoff, dataoff, optoff, optlen, i;
+ u32 *dest = &regs->data[priv->dreg];
+ const struct dccp_hdr *dh;
+ struct dccp_hdr _dh;
+
+ if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff)
+ goto err;
+
+ thoff = nft_thoff(pkt);
+
+ dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh);
+ if (!dh)
+ goto err;
+
+ dataoff = dh->dccph_doff * sizeof(u32);
+ optoff = __dccp_hdr_len(dh);
+ if (dataoff <= optoff)
+ goto err;
+
+ optlen = dataoff - optoff;
+
+ for (i = 0; i < optlen; ) {
+ /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in
+ * the length; the remaining options are at least 2B long. In
+ * all cases, the first byte contains the option type. In
+ * multi-byte options, the second byte contains the option
+ * length, which must be at least two: 1 for the type plus 1 for
+ * the length plus 0-253 for any following option data. We
+ * aren't interested in the option data, only the type and the
+ * length, so we don't need to read more than two bytes at a
+ * time.
+ */
+ unsigned int buflen = optlen - i;
+ u8 buf[2], *bufp;
+ u8 type, len;
+
+ if (buflen > sizeof(buf))
+ buflen = sizeof(buf);
+
+ bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen,
+ &buf);
+ if (!bufp)
+ goto err;
+
+ type = bufp[0];
+
+ if (type == priv->type) {
+ nft_reg_store8(dest, 1);
+ return;
+ }
+
+ if (type <= DCCPO_MAX_RESERVED) {
+ i++;
+ continue;
+ }
+
+ if (buflen < 2)
+ goto err;
+
+ len = bufp[1];
+
+ if (len < 2)
+ goto err;
+
+ i += len;
+ }
+
+err:
+ *dest = 0;
+}
+#endif
+
static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
[NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
[NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
[NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
- [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 },
- [NFTA_EXTHDR_OP] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_EXTHDR_SREG] = { .type = NLA_U32 },
};
@@ -509,7 +588,7 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
priv->flags = flags;
priv->op = op;
- return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg,
+ return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg,
priv->len);
}
@@ -557,6 +636,24 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
return 0;
}
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static int nft_exthdr_dccp_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err = nft_exthdr_init(ctx, expr, tb);
+
+ if (err < 0)
+ return err;
+
+ if (!(priv->flags & NFT_EXTHDR_F_PRESENT))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+#endif
+
static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
{
if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
@@ -686,6 +783,17 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = {
.reduce = nft_exthdr_reduce,
};
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static const struct nft_expr_ops nft_exthdr_dccp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_dccp_eval,
+ .init = nft_exthdr_dccp_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
+};
+#endif
+
static const struct nft_expr_ops *
nft_exthdr_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -720,6 +828,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_sctp_ops;
break;
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+ case NFT_EXTHDR_OP_DCCP:
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_dccp_ops;
+ break;
+#endif
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 6e049fd48760..96e02a83c045 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -14,19 +14,19 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
+#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
+ NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
+ NFTA_FIB_F_PRESENT)
+
const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
[NFTA_FIB_DREG] = { .type = NLA_U32 },
[NFTA_FIB_RESULT] = { .type = NLA_U32 },
- [NFTA_FIB_FLAGS] = { .type = NLA_U32 },
+ [NFTA_FIB_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL),
};
EXPORT_SYMBOL(nft_fib_policy);
-#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
- NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
- NFTA_FIB_F_PRESENT)
-
-int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_fib *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -34,11 +34,9 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
- hooks = (1 << NF_INET_PRE_ROUTING);
- if (priv->flags & NFTA_FIB_F_IIF) {
- hooks |= (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_FORWARD);
- }
+ hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
break;
case NFT_FIB_RESULT_ADDRTYPE:
if (priv->flags & NFTA_FIB_F_IIF)
@@ -77,7 +75,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS]));
- if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL))
+ if (priv->flags == 0)
return -EINVAL;
if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) ==
@@ -144,13 +142,17 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
index = dev ? dev->ifindex : 0;
- *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index;
+ if (priv->flags & NFTA_FIB_F_PRESENT)
+ nft_reg_store8(dreg, !!index);
+ else
+ *dreg = index;
+
break;
case NFT_FIB_RESULT_OIFNAME:
if (priv->flags & NFTA_FIB_F_PRESENT)
- *dreg = !!dev;
+ nft_reg_store8(dreg, !!dev);
else
- strncpy(reg, dev ? dev->name : "", IFNAMSIZ);
+ strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ);
break;
default:
WARN_ON_ONCE(1);
@@ -203,4 +205,5 @@ bool nft_fib_reduce(struct nft_regs_track *track,
EXPORT_SYMBOL_GPL(nft_fib_reduce);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Query routing table from nftables");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index e860d8fe0e5e..b8f76c9057fd 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -8,7 +8,8 @@
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/ip.h> /* for ipv4 options. */
+#include <net/ip.h>
+#include <net/flow.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -19,253 +20,6 @@ struct nft_flow_offload {
struct nft_flowtable *flowtable;
};
-static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
-{
- if (dst_xfrm(dst))
- return FLOW_OFFLOAD_XMIT_XFRM;
-
- return FLOW_OFFLOAD_XMIT_NEIGH;
-}
-
-static void nft_default_forward_path(struct nf_flow_route *route,
- struct dst_entry *dst_cache,
- enum ip_conntrack_dir dir)
-{
- route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
- route->tuple[dir].dst = dst_cache;
- route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
-}
-
-static bool nft_is_valid_ether_device(const struct net_device *dev)
-{
- if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
- dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
- return false;
-
- return true;
-}
-
-static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
- const struct dst_entry *dst_cache,
- const struct nf_conn *ct,
- enum ip_conntrack_dir dir, u8 *ha,
- struct net_device_path_stack *stack)
-{
- const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
- struct net_device *dev = dst_cache->dev;
- struct neighbour *n;
- u8 nud_state;
-
- if (!nft_is_valid_ether_device(dev))
- goto out;
-
- n = dst_neigh_lookup(dst_cache, daddr);
- if (!n)
- return -1;
-
- read_lock_bh(&n->lock);
- nud_state = n->nud_state;
- ether_addr_copy(ha, n->ha);
- read_unlock_bh(&n->lock);
- neigh_release(n);
-
- if (!(nud_state & NUD_VALID))
- return -1;
-
-out:
- return dev_fill_forward_path(dev, ha, stack);
-}
-
-struct nft_forward_info {
- const struct net_device *indev;
- const struct net_device *outdev;
- const struct net_device *hw_outdev;
- struct id {
- __u16 id;
- __be16 proto;
- } encap[NF_FLOW_TABLE_ENCAP_MAX];
- u8 num_encaps;
- u8 ingress_vlans;
- u8 h_source[ETH_ALEN];
- u8 h_dest[ETH_ALEN];
- enum flow_offload_xmit_type xmit_type;
-};
-
-static void nft_dev_path_info(const struct net_device_path_stack *stack,
- struct nft_forward_info *info,
- unsigned char *ha, struct nf_flowtable *flowtable)
-{
- const struct net_device_path *path;
- int i;
-
- memcpy(info->h_dest, ha, ETH_ALEN);
-
- for (i = 0; i < stack->num_paths; i++) {
- path = &stack->path[i];
- switch (path->type) {
- case DEV_PATH_ETHERNET:
- case DEV_PATH_DSA:
- case DEV_PATH_VLAN:
- case DEV_PATH_PPPOE:
- info->indev = path->dev;
- if (is_zero_ether_addr(info->h_source))
- memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
-
- if (path->type == DEV_PATH_ETHERNET)
- break;
- if (path->type == DEV_PATH_DSA) {
- i = stack->num_paths;
- break;
- }
-
- /* DEV_PATH_VLAN and DEV_PATH_PPPOE */
- if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
- info->indev = NULL;
- break;
- }
- if (!info->outdev)
- info->outdev = path->dev;
- info->encap[info->num_encaps].id = path->encap.id;
- info->encap[info->num_encaps].proto = path->encap.proto;
- info->num_encaps++;
- if (path->type == DEV_PATH_PPPOE)
- memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
- break;
- case DEV_PATH_BRIDGE:
- if (is_zero_ether_addr(info->h_source))
- memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
-
- switch (path->bridge.vlan_mode) {
- case DEV_PATH_BR_VLAN_UNTAG_HW:
- info->ingress_vlans |= BIT(info->num_encaps - 1);
- break;
- case DEV_PATH_BR_VLAN_TAG:
- info->encap[info->num_encaps].id = path->bridge.vlan_id;
- info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
- info->num_encaps++;
- break;
- case DEV_PATH_BR_VLAN_UNTAG:
- info->num_encaps--;
- break;
- case DEV_PATH_BR_VLAN_KEEP:
- break;
- }
- info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
- break;
- default:
- info->indev = NULL;
- break;
- }
- }
- if (!info->outdev)
- info->outdev = info->indev;
-
- info->hw_outdev = info->indev;
-
- if (nf_flowtable_hw_offload(flowtable) &&
- nft_is_valid_ether_device(info->indev))
- info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
-}
-
-static bool nft_flowtable_find_dev(const struct net_device *dev,
- struct nft_flowtable *ft)
-{
- struct nft_hook *hook;
- bool found = false;
-
- list_for_each_entry_rcu(hook, &ft->hook_list, list) {
- if (hook->ops.dev != dev)
- continue;
-
- found = true;
- break;
- }
-
- return found;
-}
-
-static void nft_dev_forward_path(struct nf_flow_route *route,
- const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- struct nft_flowtable *ft)
-{
- const struct dst_entry *dst = route->tuple[dir].dst;
- struct net_device_path_stack stack;
- struct nft_forward_info info = {};
- unsigned char ha[ETH_ALEN];
- int i;
-
- if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
- nft_dev_path_info(&stack, &info, ha, &ft->data);
-
- if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
- return;
-
- route->tuple[!dir].in.ifindex = info.indev->ifindex;
- for (i = 0; i < info.num_encaps; i++) {
- route->tuple[!dir].in.encap[i].id = info.encap[i].id;
- route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
- }
- route->tuple[!dir].in.num_encaps = info.num_encaps;
- route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
-
- if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
- memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
- memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
- route->tuple[dir].out.ifindex = info.outdev->ifindex;
- route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
- route->tuple[dir].xmit_type = info.xmit_type;
- }
-}
-
-static int nft_flow_route(const struct nft_pktinfo *pkt,
- const struct nf_conn *ct,
- struct nf_flow_route *route,
- enum ip_conntrack_dir dir,
- struct nft_flowtable *ft)
-{
- struct dst_entry *this_dst = skb_dst(pkt->skb);
- struct dst_entry *other_dst = NULL;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
- fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
- fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
- fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
- fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos);
- fl.u.ip4.flowi4_mark = pkt->skb->mark;
- fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
- break;
- case NFPROTO_IPV6:
- fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
- fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
- fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
- fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
- fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
- fl.u.ip6.flowi6_mark = pkt->skb->mark;
- fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
- break;
- }
-
- nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
- if (!other_dst)
- return -ENOENT;
-
- nft_default_forward_path(route, this_dst, dir);
- nft_default_forward_path(route, other_dst, !dir);
-
- if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
- route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
- nft_dev_forward_path(route, ct, dir, ft);
- nft_dev_forward_path(route, ct, !dir, ft);
- }
-
- return 0;
-}
-
static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
if (skb_sec_path(skb))
@@ -283,6 +37,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
return false;
}
+static void flow_offload_ct_tcp(struct nf_conn *ct)
+{
+ /* conntrack will not see all packets, disable tcp window validation. */
+ spin_lock_bh(&ct->lock);
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ spin_unlock_bh(&ct->lock);
+}
+
static void nft_flow_offload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -349,24 +112,21 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
if (!flow)
goto err_flow_alloc;
- if (flow_offload_route_init(flow, &route) < 0)
- goto err_flow_add;
-
- if (tcph) {
- ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- }
+ flow_offload_route_init(flow, &route);
+ if (tcph)
+ flow_offload_ct_tcp(ct);
+ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
ret = flow_offload_add(flowtable, flow);
if (ret < 0)
goto err_flow_add;
- dst_release(route.tuple[!dir].dst);
return;
err_flow_add:
flow_offload_free(flow);
err_flow_alloc:
+ dst_release(route.tuple[dir].dst);
dst_release(route.tuple[!dir].dst);
err_flow_route:
clear_bit(IPS_OFFLOAD_BIT, &ct->status);
@@ -375,11 +135,15 @@ out:
}
static int nft_flow_offload_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
unsigned int hook_mask = (1 << NF_INET_FORWARD);
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain, hook_mask);
}
@@ -399,13 +163,15 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
if (!tb[NFTA_FLOW_TABLE_NAME])
return -EINVAL;
- flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(ctx->net, ctx->table,
+ tb[NFTA_FLOW_TABLE_NAME], genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
+ if (!nft_use_inc(&flowtable->use))
+ return -EMFILE;
+
priv->flowtable = flowtable;
- flowtable->use++;
return nf_ct_netns_get(ctx->net, ctx->family);
}
@@ -424,7 +190,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx,
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
- priv->flowtable->use++;
+ nft_use_inc_restore(&priv->flowtable->use);
}
static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 7b9d4d1bd17c..152a9fb4d23a 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -40,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
[NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
[NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 },
- [NFTA_FWD_NFPROTO] = { .type = NLA_U32 },
+ [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -52,7 +52,7 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_FWD_SREG_DEV] == NULL)
return -EINVAL;
- return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
sizeof(int));
}
@@ -178,12 +178,12 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
sizeof(int));
if (err < 0)
return err;
- return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
+ return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
addr_len);
}
@@ -204,8 +204,7 @@ nla_put_failure:
}
static int nft_fwd_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) |
(1 << NF_NETDEV_EGRESS));
@@ -270,4 +269,5 @@ module_exit(nft_fwd_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nftables netdev packet forwarding support");
MODULE_ALIAS_NFT_AF_EXPR(5, "fwd");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index ee8d487b69c0..5d034bbb6913 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -51,7 +51,8 @@ static void nft_symhash_eval(const struct nft_expr *expr,
struct sk_buff *skb = pkt->skb;
u32 h;
- h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus);
+ h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb),
+ priv->modulus);
regs->data[priv->dreg] = h + priv->offset;
}
@@ -59,7 +60,7 @@ static void nft_symhash_eval(const struct nft_expr *expr,
static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SREG] = { .type = NLA_U32 },
[NFTA_HASH_DREG] = { .type = NLA_U32 },
- [NFTA_HASH_LEN] = { .type = NLA_U32 },
+ [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_HASH_MODULUS] = { .type = NLA_U32 },
[NFTA_HASH_SEED] = { .type = NLA_U32 },
[NFTA_HASH_OFFSET] = { .type = NLA_U32 },
@@ -91,7 +92,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
priv->len = len;
- err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len);
+ err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len);
if (err < 0)
return err;
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index c9d2f7c29f53..02ee5fb69871 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -76,11 +76,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
switch (priv->data.verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- if (nft_chain_is_bound(chain)) {
- err = -EBUSY;
+ err = nf_tables_bind_chain(ctx, chain);
+ if (err < 0)
goto err1;
- }
- chain->bound = true;
break;
default:
break;
@@ -98,15 +96,86 @@ static void nft_immediate_activate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_ctx chain_ctx;
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+ if (!nft_chain_binding(chain))
+ break;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ list_for_each_entry(rule, &chain->rules, list)
+ nft_rule_expr_activate(&chain_ctx, rule);
+
+ nft_clear(ctx->net, chain);
+ break;
+ default:
+ break;
+ }
+ }
return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg));
}
+static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx,
+ struct nft_chain *chain,
+ enum nft_trans_phase phase)
+{
+ struct nft_ctx chain_ctx;
+ struct nft_rule *rule;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ list_for_each_entry(rule, &chain->rules, list)
+ nft_rule_expr_deactivate(&chain_ctx, rule, phase);
+}
+
static void nft_immediate_deactivate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
enum nft_trans_phase phase)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_chain *chain;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+ if (!nft_chain_binding(chain))
+ break;
+
+ switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ nf_tables_unbind_chain(ctx, chain);
+ nft_deactivate_next(ctx->net, chain);
+ break;
+ case NFT_TRANS_PREPARE:
+ nft_immediate_chain_deactivate(ctx, chain, phase);
+ nft_deactivate_next(ctx->net, chain);
+ break;
+ default:
+ nft_immediate_chain_deactivate(ctx, chain, phase);
+ nft_chain_del(chain);
+ chain->bound = false;
+ nft_use_dec(&chain->table->use);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
if (phase == NFT_TRANS_COMMIT)
return;
@@ -131,16 +200,28 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx,
case NFT_GOTO:
chain = data->verdict.chain;
- if (!nft_chain_is_bound(chain))
+ if (!nft_chain_binding(chain))
+ break;
+
+ /* Rule construction failed, but chain is already bound:
+ * let the transaction records release this chain and its rules.
+ */
+ if (chain->bound) {
+ nft_use_dec(&chain->use);
break;
+ }
+ /* Rule has been deleted, release chain and its rules. */
chain_ctx = *ctx;
chain_ctx.chain = chain;
- list_for_each_entry_safe(rule, n, &chain->rules, list)
- nf_tables_rule_release(&chain_ctx, rule);
-
- nf_tables_chain_destroy(&chain_ctx);
+ nft_use_dec(&chain->use);
+ list_for_each_entry_safe(rule, n, &chain->rules, list) {
+ nft_use_dec(&chain->use);
+ list_del(&rule->list);
+ nf_tables_rule_destroy(&chain_ctx, rule);
+ }
+ nf_tables_chain_destroy(chain);
break;
default:
break;
@@ -163,8 +244,7 @@ nla_put_failure:
}
static int nft_immediate_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **d)
+ const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
struct nft_ctx *pctx = (struct nft_ctx *)ctx;
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 28e2873ba24e..c4569d4b9228 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -23,7 +23,14 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
-static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
+struct nft_inner_tun_ctx_locked {
+ struct nft_inner_tun_ctx ctx;
+ local_lock_t bh_lock;
+};
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
/* Same layout as nft_expr but it embeds the private expression data area. */
struct __nft_expr {
@@ -210,35 +217,71 @@ static int nft_inner_parse(const struct nft_inner *priv,
struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *tun_ctx)
{
- struct nft_inner_tun_ctx ctx = {};
u32 off = pkt->inneroff;
if (priv->flags & NFT_INNER_HDRSIZE &&
- nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
+ nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
return -1;
if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
- if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
+ if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
return -1;
} else if (priv->flags & NFT_INNER_TH) {
- ctx.inner_thoff = off;
- ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ tun_ctx->inner_thoff = off;
+ tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
}
- *tun_ctx = ctx;
tun_ctx->type = priv->type;
+ tun_ctx->cookie = (unsigned long)pkt->skb;
pkt->flags |= NFT_PKTINFO_INNER_FULL;
return 0;
}
+static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+ local_bh_disable();
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
+ if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); /* unlock while BH is still off */
+ local_bh_enable();
+ return false;
+ }
+ *tun_ctx = *this_cpu_tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+
+ return true;
+}
+
+static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
+ const struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+ local_bh_disable();
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
+ if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
+ *this_cpu_tun_ctx = *tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+}
+
static bool nft_inner_parse_needed(const struct nft_inner *priv,
const struct nft_pktinfo *pkt,
- const struct nft_inner_tun_ctx *tun_ctx)
+ struct nft_inner_tun_ctx *tun_ctx)
{
if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
return true;
+ if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
+ return true;
+
if (priv->type != tun_ctx->type)
return true;
@@ -248,27 +291,29 @@ static bool nft_inner_parse_needed(const struct nft_inner *priv,
static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
const struct nft_inner *priv = nft_expr_priv(expr);
+ struct nft_inner_tun_ctx tun_ctx = {};
if (nft_payload_inner_offset(pkt) < 0)
goto err;
- if (nft_inner_parse_needed(priv, pkt, tun_ctx) &&
- nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0)
+ if (nft_inner_parse_needed(priv, pkt, &tun_ctx) &&
+ nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0)
goto err;
switch (priv->expr_type) {
case NFT_INNER_EXPR_PAYLOAD:
- nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+ nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
break;
case NFT_INNER_EXPR_META:
- nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+ nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
break;
default:
WARN_ON_ONCE(1);
goto err;
}
+ nft_inner_save_tun_ctx(pkt, &tun_ctx);
+
return;
err:
regs->verdict.code = NFT_BREAK;
@@ -298,6 +343,7 @@ static int nft_inner_init(const struct nft_ctx *ctx,
int err;
if (!tb[NFTA_INNER_FLAGS] ||
+ !tb[NFTA_INNER_NUM] ||
!tb[NFTA_INNER_HDRSIZE] ||
!tb[NFTA_INNER_TYPE] ||
!tb[NFTA_INNER_EXPR])
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 7f2bda6641bd..de1b6066bfa8 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -102,14 +102,18 @@ static void nft_last_destroy(const struct nft_ctx *ctx,
kfree(priv->last);
}
-static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_last_priv *priv_dst = nft_expr_priv(dst);
+ struct nft_last_priv *priv_src = nft_expr_priv(src);
- priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
+ priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp);
if (!priv_dst->last)
return -ENOMEM;
+ priv_dst->last->set = priv_src->last->set;
+ priv_dst->last->jiffies = priv_src->last->jiffies;
+
return 0;
}
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 145dc62c6247..21d26b79b460 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -58,16 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
static int nft_limit_init(struct nft_limit_priv *priv,
const struct nlattr * const tb[], bool pkts)
{
- u64 unit, tokens;
+ u64 unit, tokens, rate_with_burst;
+ bool invert = false;
if (tb[NFTA_LIMIT_RATE] == NULL ||
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ if (priv->rate == 0)
+ return -EINVAL;
+
unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- priv->nsecs = unit * NSEC_PER_SEC;
- if (priv->rate == 0 || priv->nsecs < unit)
+ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs))
return -EOVERFLOW;
if (tb[NFTA_LIMIT_BURST])
@@ -76,18 +79,35 @@ static int nft_limit_init(struct nft_limit_priv *priv,
if (pkts && priv->burst == 0)
priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
- if (priv->rate + priv->burst < priv->rate)
+ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst))
return -EOVERFLOW;
if (pkts) {
- tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst;
+ u64 tmp = div64_u64(priv->nsecs, priv->rate);
+
+ if (check_mul_overflow(tmp, priv->burst, &tokens))
+ return -EOVERFLOW;
} else {
+ u64 tmp;
+
/* The token bucket size limits the number of tokens can be
* accumulated. tokens_max specifies the bucket size.
* tokens_max = unit * (rate + burst) / rate.
*/
- tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst),
- priv->rate);
+ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp))
+ return -EOVERFLOW;
+
+ tokens = div64_u64(tmp, priv->rate);
+ }
+
+ if (tb[NFTA_LIMIT_FLAGS]) {
+ u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
+
+ if (flags & ~NFT_LIMIT_F_INV)
+ return -EOPNOTSUPP;
+
+ if (flags & NFT_LIMIT_F_INV)
+ invert = true;
}
priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
@@ -96,13 +116,7 @@ static int nft_limit_init(struct nft_limit_priv *priv,
priv->limit->tokens = tokens;
priv->tokens_max = priv->limit->tokens;
-
- if (tb[NFTA_LIMIT_FLAGS]) {
- u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
-
- if (flags & NFT_LIMIT_F_INV)
- priv->invert = true;
- }
+ priv->invert = invert;
priv->limit->last = ktime_get_ns();
spin_lock_init(&priv->limit->lock);
@@ -136,7 +150,7 @@ static void nft_limit_destroy(const struct nft_ctx *ctx,
}
static int nft_limit_clone(struct nft_limit_priv *priv_dst,
- const struct nft_limit_priv *priv_src)
+ const struct nft_limit_priv *priv_src, gfp_t gfp)
{
priv_dst->tokens_max = priv_src->tokens_max;
priv_dst->rate = priv_src->rate;
@@ -144,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst,
priv_dst->burst = priv_src->burst;
priv_dst->invert = priv_src->invert;
- priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC);
+ priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp);
if (!priv_dst->limit)
return -ENOMEM;
@@ -209,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
nft_limit_destroy(ctx, &priv->limit);
}
-static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
{
struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
priv_dst->cost = priv_src->cost;
- return nft_limit_clone(&priv_dst->limit, &priv_src->limit);
+ return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp);
}
static struct nft_expr_type nft_limit_type;
@@ -267,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
nft_limit_destroy(ctx, priv);
}
-static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
{
struct nft_limit_priv *priv_dst = nft_expr_priv(dst);
struct nft_limit_priv *priv_src = nft_expr_priv(src);
- return nft_limit_clone(priv_dst, priv_src);
+ return nft_limit_clone(priv_dst, priv_src, gfp);
}
static const struct nft_expr_ops nft_limit_bytes_ops = {
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 5defe6e4fd98..e35588137995 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -163,7 +163,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
- priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+ priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT);
if (priv->prefix == NULL)
return -ENOMEM;
nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index cae5a6724163..fc2d7c5d83c8 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -19,40 +19,78 @@ struct nft_lookup {
struct nft_set *set;
u8 sreg;
u8 dreg;
+ bool dreg_set;
bool invert;
struct nft_set_binding binding;
};
-#ifdef CONFIG_RETPOLINE
-bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+__nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (set->ops == &nft_set_hash_fast_type.ops)
- return nft_hash_lookup_fast(net, set, key, ext);
+ return nft_hash_lookup_fast(net, set, key);
if (set->ops == &nft_set_hash_type.ops)
- return nft_hash_lookup(net, set, key, ext);
+ return nft_hash_lookup(net, set, key);
if (set->ops == &nft_set_rhash_type.ops)
- return nft_rhash_lookup(net, set, key, ext);
+ return nft_rhash_lookup(net, set, key);
if (set->ops == &nft_set_bitmap_type.ops)
- return nft_bitmap_lookup(net, set, key, ext);
+ return nft_bitmap_lookup(net, set, key);
if (set->ops == &nft_set_pipapo_type.ops)
- return nft_pipapo_lookup(net, set, key, ext);
+ return nft_pipapo_lookup(net, set, key);
#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
if (set->ops == &nft_set_pipapo_avx2_type.ops)
- return nft_pipapo_avx2_lookup(net, set, key, ext);
+ return nft_pipapo_avx2_lookup(net, set, key);
#endif
if (set->ops == &nft_set_rbtree_type.ops)
- return nft_rbtree_lookup(net, set, key, ext);
+ return nft_rbtree_lookup(net, set, key);
WARN_ON_ONCE(1);
- return set->ops->lookup(net, set, key, ext);
+#endif
+ return set->ops->lookup(net, set, key);
+}
+
+static unsigned int nft_base_seq(const struct net *net)
+{
+ /* pairs with smp_store_release() in nf_tables_commit() */
+ return smp_load_acquire(&net->nft.base_seq);
+}
+
+static bool nft_lookup_should_retry(const struct net *net, unsigned int seq)
+{
+ return unlikely(seq != nft_base_seq(net));
+}
+
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ const struct nft_set_ext *ext;
+ unsigned int base_seq;
+
+ do {
+ base_seq = nft_base_seq(net);
+
+ ext = __nft_set_do_lookup(net, set, key);
+ if (ext)
+ break;
+ /* No match? There is a small chance that lookup was
+ * performed in the old generation, but nf_tables_commit()
+ * already unlinked a (matching) element.
+ *
+ * We need to repeat the lookup to make sure that we didn't
+ * miss a matching element in the new generation.
+ */
+ } while (nft_lookup_should_retry(net, base_seq));
+
+ return ext;
}
EXPORT_SYMBOL_GPL(nft_set_do_lookup);
-#endif
void nft_lookup_eval(const struct nft_expr *expr,
struct nft_regs *regs,
@@ -60,12 +98,12 @@ void nft_lookup_eval(const struct nft_expr *expr,
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
- const struct nft_set_ext *ext = NULL;
const struct net *net = nft_net(pkt);
+ const struct nft_set_ext *ext;
bool found;
- found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^
- priv->invert;
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ found = !!ext ^ priv->invert;
if (!found) {
ext = nft_set_catchall_lookup(net, set);
if (!ext) {
@@ -75,7 +113,7 @@ void nft_lookup_eval(const struct nft_expr *expr,
}
if (ext) {
- if (set->flags & NFT_SET_MAP)
+ if (priv->dreg_set)
nft_data_copy(&regs->data[priv->dreg],
nft_set_ext_data(ext), set->dlen);
@@ -89,7 +127,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
[NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
[NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
[NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
- [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV),
};
static int nft_lookup_init(const struct nft_ctx *ctx,
@@ -111,7 +150,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
- err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg,
+ err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg,
set->klen);
if (err < 0)
return err;
@@ -119,14 +158,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (tb[NFTA_LOOKUP_FLAGS]) {
flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS]));
- if (flags & ~NFT_LOOKUP_F_INV)
- return -EINVAL;
-
- if (flags & NFT_LOOKUP_F_INV) {
- if (set->flags & NFT_SET_MAP)
- return -EINVAL;
+ if (flags & NFT_LOOKUP_F_INV)
priv->invert = true;
- }
}
if (tb[NFTA_LOOKUP_DREG] != NULL) {
@@ -136,12 +169,22 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
return -EINVAL;
err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG],
- &priv->dreg, NULL, set->dtype,
+ &priv->dreg, NULL,
+ nft_set_datatype(set),
set->dlen);
if (err < 0)
return err;
- } else if (set->flags & NFT_SET_MAP)
- return -EINVAL;
+ priv->dreg_set = true;
+ } else if (set->flags & NFT_SET_MAP) {
+ /* Map given, but user asks for lookup only (i.e. to
+ * ignore value associated with key).
+ *
+ * This makes no sense for anonymous maps since they are
+ * scoped to the rule, but for named sets this can be useful.
+ */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ return -EINVAL;
+ }
priv->binding.flags = set->flags & NFT_SET_MAP;
@@ -167,7 +210,7 @@ static void nft_lookup_activate(const struct nft_ctx *ctx,
{
struct nft_lookup *priv = nft_expr_priv(expr);
- priv->set->use++;
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_lookup_destroy(const struct nft_ctx *ctx,
@@ -188,7 +231,7 @@ static int nft_lookup_dump(struct sk_buff *skb,
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg))
goto nla_put_failure;
- if (priv->set->flags & NFT_SET_MAP)
+ if (priv->dreg_set)
if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags)))
@@ -199,55 +242,24 @@ nla_put_failure:
return -1;
}
-static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- struct nft_ctx *pctx = (struct nft_ctx *)ctx;
- const struct nft_data *data;
- int err;
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- pctx->level++;
- err = nft_chain_validate(ctx, data->verdict.chain);
- if (err < 0)
- return err;
- pctx->level--;
- break;
- default:
- break;
- }
-
- return 0;
-}
-
static int nft_lookup_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **d)
+ const struct nft_expr *expr)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
- struct nft_set_iter iter;
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_setelem_validate,
+ };
if (!(priv->set->flags & NFT_SET_MAP) ||
priv->set->dtype != NFT_DATA_VERDICT)
return 0;
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nft_lookup_validate_setelem;
-
priv->set->ops->walk(ctx, priv->set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_validate(ctx, priv->set);
+
if (iter.err < 0)
return iter.err;
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index e55e455275c4..868bd4d73555 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -20,14 +20,14 @@ struct nft_masq {
};
static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
- [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+ [NFTA_MASQ_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
[NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 },
};
static int nft_masq_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
int err;
@@ -43,24 +43,21 @@ static int nft_masq_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- u32 plen = sizeof_field(struct nf_nat_range, min_addr.all);
+ u32 plen = sizeof_field(struct nf_nat_range, min_proto.all);
struct nft_masq *priv = nft_expr_priv(expr);
int err;
- if (tb[NFTA_MASQ_FLAGS]) {
+ if (tb[NFTA_MASQ_FLAGS])
priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
- }
if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
- err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN],
+ err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN],
&priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
- err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX],
+ err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX],
&priv->sreg_proto_max,
plen);
if (err < 0)
@@ -96,23 +93,39 @@ nla_put_failure:
return -1;
}
-static void nft_masq_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_masq_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_masq *priv = nft_expr_priv(expr);
+ const struct nft_masq *priv = nft_expr_priv(expr);
struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
+ range.min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
+ }
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb,
+ nft_hook(pkt),
+ &range,
+ nft_out(pkt));
+ break;
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
+ nft_out(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
- &range, nft_out(pkt));
}
static void
@@ -125,7 +138,7 @@ static struct nft_expr_type nft_masq_ipv4_type;
static const struct nft_expr_ops nft_masq_ipv4_ops = {
.type = &nft_masq_ipv4_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv4_eval,
+ .eval = nft_masq_eval,
.init = nft_masq_init,
.destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
@@ -143,25 +156,6 @@ static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
};
#ifdef CONFIG_NF_TABLES_IPV6
-static void nft_masq_ipv6_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- range.flags = priv->flags;
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- }
- regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
- nft_out(pkt));
-}
-
static void
nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -172,7 +166,7 @@ static struct nft_expr_type nft_masq_ipv6_type;
static const struct nft_expr_ops nft_masq_ipv6_ops = {
.type = &nft_masq_ipv6_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv6_eval,
+ .eval = nft_masq_eval,
.init = nft_masq_init,
.destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
@@ -204,20 +198,6 @@ static inline void nft_masq_module_exit_ipv6(void) {}
#endif
#ifdef CONFIG_NF_TABLES_INET
-static void nft_masq_inet_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- return nft_masq_ipv4_eval(expr, regs, pkt);
- case NFPROTO_IPV6:
- return nft_masq_ipv6_eval(expr, regs, pkt);
- }
-
- WARN_ON_ONCE(1);
-}
-
static void
nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -228,7 +208,7 @@ static struct nft_expr_type nft_masq_inet_type;
static const struct nft_expr_ops nft_masq_inet_ops = {
.type = &nft_masq_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_inet_eval,
+ .eval = nft_masq_eval,
.init = nft_masq_init,
.destroy = nft_masq_inet_destroy,
.dump = nft_masq_dump,
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index e384e0de7a54..05cd1e6e6a2f 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key,
{
switch (key) {
case NFT_META_TIME_NS:
- nft_reg_store64(dest, ktime_get_real_ns());
+ nft_reg_store64((u64 *)dest, ktime_get_real_ns());
break;
case NFT_META_TIME_DAY:
nft_reg_store8(dest, nft_meta_weekday());
@@ -185,12 +185,12 @@ static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key,
case NFT_META_IIFKIND:
if (!in || !in->rtnl_link_ops)
return false;
- strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
+ strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
break;
case NFT_META_OIFKIND:
if (!out || !out->rtnl_link_ops)
return false;
- strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+ strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
break;
default:
return false;
@@ -206,7 +206,7 @@ static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev)
static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev)
{
- strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ);
+ strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ);
}
static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev)
@@ -458,7 +458,7 @@ EXPORT_SYMBOL_GPL(nft_meta_set_eval);
const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
[NFTA_META_DREG] = { .type = NLA_U32 },
- [NFTA_META_KEY] = { .type = NLA_U32 },
+ [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_META_SREG] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(nft_meta_policy);
@@ -581,8 +581,7 @@ static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx)
}
static int nft_meta_get_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -600,8 +599,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
}
int nft_meta_set_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -657,7 +655,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
}
priv->len = len;
- err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len);
+ err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len);
if (err < 0)
return err;
@@ -839,6 +837,9 @@ static int nft_meta_inner_init(const struct nft_ctx *ctx,
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
+ if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG])
+ return -EINVAL;
+
priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
switch (priv->key) {
case NFT_META_PROTOCOL:
@@ -951,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx,
if (tb[NFTA_SECMARK_CTX] == NULL)
return -EINVAL;
- priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
+ priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT);
if (!priv->ctx)
return -ENOMEM;
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 047999150390..6e21f72c5b57 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -132,16 +132,21 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
[NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 },
[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
- [NFTA_NAT_FLAGS] = { .type = NLA_U32 },
+ [NFTA_NAT_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
};
static int nft_nat_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct nft_nat *priv = nft_expr_priv(expr);
int err;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
if (err < 0)
return err;
@@ -208,13 +213,13 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->family = family;
if (tb[NFTA_NAT_REG_ADDR_MIN]) {
- err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN],
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN],
&priv->sreg_addr_min, alen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_ADDR_MAX]) {
- err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX],
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX],
&priv->sreg_addr_max,
alen);
if (err < 0)
@@ -226,15 +231,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags |= NF_NAT_RANGE_MAP_IPS;
}
- plen = sizeof_field(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_proto.all);
if (tb[NFTA_NAT_REG_PROTO_MIN]) {
- err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN],
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN],
&priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_PROTO_MAX]) {
- err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX],
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX],
&priv->sreg_proto_max,
plen);
if (err < 0)
@@ -246,11 +251,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
- if (tb[NFTA_NAT_FLAGS]) {
+ if (tb[NFTA_NAT_FLAGS])
priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EOPNOTSUPP;
- }
return nf_ct_netns_get(ctx->net, family);
}
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 7d29db7c2ac0..bd058babfc82 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -66,7 +66,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL);
+ priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT);
if (!priv->counter)
return -ENOMEM;
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 7b01aa2ef653..1a62e384766a 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -13,15 +13,44 @@
#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
-static void nft_objref_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_objref_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_object *obj = nft_objref_priv(expr);
obj->ops->eval(obj, regs, pkt);
}
+static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type)
+{
+ unsigned int hooks;
+
+ switch (type) {
+ case NFT_OBJECT_SYNPROXY:
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD);
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_objref_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ return nft_objref_validate_obj_type(ctx, obj->ops->type->type);
+}
+
static int nft_objref_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -41,8 +70,10 @@ static int nft_objref_init(const struct nft_ctx *ctx,
if (IS_ERR(obj))
return -ENOENT;
+ if (!nft_use_inc(&obj->use))
+ return -EMFILE;
+
nft_objref_priv(expr) = obj;
- obj->use++;
return 0;
}
@@ -72,7 +103,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx,
if (phase == NFT_TRANS_COMMIT)
return;
- obj->use--;
+ nft_use_dec(&obj->use);
}
static void nft_objref_activate(const struct nft_ctx *ctx,
@@ -80,7 +111,7 @@ static void nft_objref_activate(const struct nft_ctx *ctx,
{
struct nft_object *obj = nft_objref_priv(expr);
- obj->use++;
+ nft_use_inc_restore(&obj->use);
}
static const struct nft_expr_ops nft_objref_ops = {
@@ -91,6 +122,7 @@ static const struct nft_expr_ops nft_objref_ops = {
.activate = nft_objref_activate,
.deactivate = nft_objref_deactivate,
.dump = nft_objref_dump,
+ .validate = nft_objref_validate,
.reduce = NFT_REDUCE_READONLY,
};
@@ -100,19 +132,18 @@ struct nft_objref_map {
struct nft_set_binding binding;
};
-static void nft_objref_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_objref_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
struct net *net = nft_net(pkt);
const struct nft_set_ext *ext;
struct nft_object *obj;
- bool found;
- found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext);
- if (!found) {
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ if (!ext) {
ext = nft_set_catchall_lookup(net, set);
if (!ext) {
regs->verdict.code = NFT_BREAK;
@@ -141,7 +172,7 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_OBJECT))
return -EINVAL;
- err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
+ err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
set->klen);
if (err < 0)
return err;
@@ -185,7 +216,7 @@ static void nft_objref_map_activate(const struct nft_ctx *ctx,
{
struct nft_objref_map *priv = nft_expr_priv(expr);
- priv->set->use++;
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_objref_map_destroy(const struct nft_ctx *ctx,
@@ -196,6 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
+static int nft_objref_map_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ return nft_objref_validate_obj_type(ctx, priv->set->objtype);
+}
+
static const struct nft_expr_ops nft_objref_map_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
@@ -205,6 +244,7 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
+ .validate = nft_objref_map_validate,
.reduce = NFT_REDUCE_READONLY,
};
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 70820c66b591..1c0b493ef0a9 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -23,7 +23,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct nft_osf *priv = nft_expr_priv(expr);
u32 *dest = &regs->data[priv->dreg];
struct sk_buff *skb = pkt->skb;
- char os_match[NFT_OSF_MAXGENRELEN + 1];
+ char os_match[NFT_OSF_MAXGENRELEN];
const struct tcphdr *tcp;
struct nf_osf_data data;
struct tcphdr _tcph;
@@ -45,7 +45,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
}
if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) {
- strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
+ strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
} else {
if (priv->flags & NFT_OSF_F_VERSION)
snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s",
@@ -53,7 +53,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
else
strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
- strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
+ strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
}
}
@@ -63,7 +63,6 @@ static int nft_osf_init(const struct nft_ctx *ctx,
{
struct nft_osf *priv = nft_expr_priv(expr);
u32 flags;
- int err;
u8 ttl;
if (!tb[NFTA_OSF_DREG])
@@ -83,13 +82,9 @@ static int nft_osf_init(const struct nft_ctx *ctx,
priv->flags = flags;
}
- err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg,
- NULL, NFT_DATA_VALUE,
- NFT_OSF_MAXGENRELEN);
- if (err < 0)
- return err;
-
- return 0;
+ return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE,
+ NFT_OSF_MAXGENRELEN);
}
static int nft_osf_dump(struct sk_buff *skb,
@@ -113,8 +108,7 @@ nla_put_failure:
}
static int nft_osf_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
unsigned int hooks;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 3a3c7746e88f..b0214418f75a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -40,41 +40,32 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
-nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
u8 *vlanh, *dst_u8 = (u8 *) d;
struct vlan_ethhdr veth;
- u8 vlan_hlen = 0;
-
- if ((skb->protocol == htons(ETH_P_8021AD) ||
- skb->protocol == htons(ETH_P_8021Q)) &&
- offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN)
- vlan_hlen += VLAN_HLEN;
vlanh = (u8 *) &veth;
- if (offset < VLAN_ETH_HLEN + vlan_hlen) {
+ if (offset < VLAN_ETH_HLEN) {
u8 ethlen = len;
- if (vlan_hlen &&
- skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0)
- return false;
- else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
+ if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
return false;
- if (offset + len > VLAN_ETH_HLEN + vlan_hlen)
- ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen;
+ if (offset + len > VLAN_ETH_HLEN)
+ ethlen -= offset + len - VLAN_ETH_HLEN;
- memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen);
+ memcpy(dst_u8, vlanh + offset, ethlen);
len -= ethlen;
if (len == 0)
return true;
dst_u8 += ethlen;
- offset = ETH_HLEN + vlan_hlen;
+ offset = ETH_HLEN;
} else {
- offset -= VLAN_HLEN + vlan_hlen;
+ offset -= VLAN_HLEN;
}
return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
@@ -154,6 +145,17 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
return pkt->inneroff;
}
+static bool nft_payload_need_vlan_adjust(u32 offset, u32 len)
+{
+ unsigned int boundary = offset + len;
+
+ /* data past ether src/dst requested, copy needed */
+ if (boundary > offsetof(struct ethhdr, h_proto))
+ return true;
+
+ return false;
+}
+
void nft_payload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -168,10 +170,11 @@ void nft_payload_eval(const struct nft_expr *expr,
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
- if (!skb_mac_header_was_set(skb))
+ if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0)
goto err;
- if (skb_vlan_tag_present(skb)) {
+ if (skb_vlan_tag_present(skb) &&
+ nft_payload_need_vlan_adjust(priv->offset, priv->len)) {
if (!nft_payload_copy_vlan(dest, skb,
priv->offset, priv->len))
goto err;
@@ -209,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 },
[NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
[NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
@@ -647,6 +650,10 @@ static int nft_payload_inner_init(const struct nft_ctx *ctx,
struct nft_payload *priv = nft_expr_priv(expr);
u32 base;
+ if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] ||
+ !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG])
+ return -EINVAL;
+
base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
switch (base) {
case NFT_PAYLOAD_TUN_HEADER:
@@ -677,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = {
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum);
if (*sum == 0)
*sum = CSUM_MANGLED_0;
}
@@ -790,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
struct nft_payload_set {
enum nft_payload_bases base:8;
- u8 offset;
+ u16 offset;
u8 len;
u8 sreg;
u8 csum_type;
@@ -798,21 +805,79 @@ struct nft_payload_set {
u8 csum_flags;
};
+/* This is not struct vlan_hdr. */
+struct nft_payload_vlan_hdr {
+ __be16 h_vlan_proto;
+ __be16 h_vlan_TCI;
+};
+
+static bool
+nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
+ int *vlan_hlen)
+{
+ struct nft_payload_vlan_hdr *vlanh;
+ __be16 vlan_proto;
+ u16 vlan_tci;
+
+ if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) {
+ *vlan_hlen = VLAN_HLEN;
+ return true;
+ }
+
+ switch (offset) {
+ case offsetof(struct vlan_ethhdr, h_vlan_proto):
+ if (len == 2) {
+ vlan_proto = nft_reg_load_be16(src);
+ skb->vlan_proto = vlan_proto;
+ } else if (len == 4) {
+ vlanh = (struct nft_payload_vlan_hdr *)src;
+ __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto,
+ ntohs(vlanh->h_vlan_TCI));
+ } else {
+ return false;
+ }
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+ if (len != 2)
+ return false;
+
+ vlan_tci = ntohs(nft_reg_load_be16(src));
+ skb->vlan_tci = vlan_tci;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
- struct sk_buff *skb = pkt->skb;
const u32 *src = &regs->data[priv->sreg];
- int offset, csum_offset;
+ int offset, csum_offset, vlan_hlen = 0;
+ struct sk_buff *skb = pkt->skb;
__wsum fsum, tsum;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
if (!skb_mac_header_was_set(skb))
goto err;
- offset = skb_mac_header(skb) - skb->data;
+
+ if (skb_vlan_tag_present(skb) &&
+ nft_payload_need_vlan_adjust(priv->offset, priv->len)) {
+ if (!nft_payload_set_vlan(src, skb,
+ priv->offset, priv->len,
+ &vlan_hlen))
+ goto err;
+
+ if (!vlan_hlen)
+ return;
+ }
+
+ offset = skb_mac_header(skb) - skb->data - vlan_hlen;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
offset = skb_network_offset(skb);
@@ -839,6 +904,9 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER &&
priv->base != NFT_PAYLOAD_INNER_HEADER) ||
skb->ip_summed != CHECKSUM_PARTIAL)) {
+ if (offset + priv->len > skb->len)
+ goto err;
+
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
@@ -872,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
struct nft_payload_set *priv = nft_expr_priv(expr);
- u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
- priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return err;
+ priv->offset = offset;
+
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
@@ -916,7 +988,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
}
priv->csum_type = csum_type;
- return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg,
+ return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg,
priv->len);
}
@@ -1001,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset);
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
if (err < 0)
return ERR_PTR(err);
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index b2b8127c8d43..344fe311878f 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -69,8 +69,7 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
}
static int nft_queue_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
@@ -136,7 +135,7 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
struct nft_queue *priv = nft_expr_priv(expr);
int err;
- err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM],
+ err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM],
&priv->sreg_qnum, sizeof(u32));
if (err < 0)
return err;
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 123578e28917..df0798da2329 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -19,10 +19,16 @@ struct nft_quota {
};
static inline bool nft_overquota(struct nft_quota *priv,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ bool *report)
{
- return atomic64_add_return(skb->len, priv->consumed) >=
- atomic64_read(&priv->quota);
+ u64 consumed = atomic64_add_return(skb->len, priv->consumed);
+ u64 quota = atomic64_read(&priv->quota);
+
+ if (report)
+ *report = consumed >= quota;
+
+ return consumed > quota;
}
static inline bool nft_quota_invert(struct nft_quota *priv)
@@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
+ if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
}
@@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj,
const struct nft_pktinfo *pkt)
{
struct nft_quota *priv = nft_obj_data(obj);
- bool overquota;
+ bool overquota, report;
- overquota = nft_overquota(priv, pkt->skb);
+ overquota = nft_overquota(priv, pkt->skb, &report);
if (overquota ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
- if (overquota &&
+ if (report &&
!test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0,
NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC);
@@ -233,15 +239,19 @@ static void nft_quota_destroy(const struct nft_ctx *ctx,
return nft_quota_do_destroy(ctx, priv);
}
-static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_quota *priv_dst = nft_expr_priv(dst);
+ struct nft_quota *priv_src = nft_expr_priv(src);
+
+ priv_dst->quota = priv_src->quota;
+ priv_dst->flags = priv_src->flags;
- priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
+ priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp);
if (!priv_dst->consumed)
return -ENOMEM;
- atomic64_set(priv_dst->consumed, 0);
+ *priv_dst->consumed = *priv_src->consumed;
return 0;
}
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 0566d6aaf1e5..ea382f7bbd78 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -42,7 +42,7 @@ void nft_range_eval(const struct nft_expr *expr,
static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
[NFTA_RANGE_SREG] = { .type = NLA_U32 },
- [NFTA_RANGE_OP] = { .type = NLA_U32 },
+ [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED },
[NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED },
};
@@ -83,7 +83,7 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
goto err2;
}
- err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg,
+ err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg,
desc_from.len);
if (err < 0)
goto err2;
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 5f7739987559..95eedad85c83 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -22,12 +22,12 @@ struct nft_redir {
static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
[NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 },
- [NFTA_REDIR_FLAGS] = { .type = NLA_U32 },
+ [NFTA_REDIR_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
};
static int nft_redir_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
int err;
@@ -48,15 +48,15 @@ static int nft_redir_init(const struct nft_ctx *ctx,
unsigned int plen;
int err;
- plen = sizeof_field(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_proto.all);
if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
- err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN],
+ err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN],
&priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_REDIR_REG_PROTO_MAX]) {
- err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX],
+ err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX],
&priv->sreg_proto_max,
plen);
if (err < 0)
@@ -64,13 +64,12 @@ static int nft_redir_init(const struct nft_ctx *ctx,
} else {
priv->sreg_proto_max = priv->sreg_proto_min;
}
+
+ priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
- if (tb[NFTA_REDIR_FLAGS]) {
+ if (tb[NFTA_REDIR_FLAGS])
priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
- }
return nf_ct_netns_get(ctx->net, ctx->family);
}
@@ -99,25 +98,37 @@ nla_put_failure:
return -1;
}
-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_redir_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_ipv4_multi_range_compat mr;
+ const struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_range2 range;
- memset(&mr, 0, sizeof(mr));
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
if (priv->sreg_proto_min) {
- mr.range[0].min.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- mr.range[0].max.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
}
- mr.range[0].flags |= priv->flags;
-
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range,
+ nft_hook(pkt));
+ break;
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
+ nft_hook(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void
@@ -130,7 +141,7 @@ static struct nft_expr_type nft_redir_ipv4_type;
static const struct nft_expr_ops nft_redir_ipv4_ops = {
.type = &nft_redir_ipv4_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv4_eval,
+ .eval = nft_redir_eval,
.init = nft_redir_init,
.destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
@@ -148,28 +159,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
};
#ifdef CONFIG_NF_TABLES_IPV6
-static void nft_redir_ipv6_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
-
- range.flags |= priv->flags;
-
- regs->verdict.code =
- nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
-}
-
static void
nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -180,7 +169,7 @@ static struct nft_expr_type nft_redir_ipv6_type;
static const struct nft_expr_ops nft_redir_ipv6_ops = {
.type = &nft_redir_ipv6_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv6_eval,
+ .eval = nft_redir_eval,
.init = nft_redir_init,
.destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
@@ -199,20 +188,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
#endif
#ifdef CONFIG_NF_TABLES_INET
-static void nft_redir_inet_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- return nft_redir_ipv4_eval(expr, regs, pkt);
- case NFPROTO_IPV6:
- return nft_redir_ipv6_eval(expr, regs, pkt);
- }
-
- WARN_ON_ONCE(1);
-}
-
static void
nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -223,7 +198,7 @@ static struct nft_expr_type nft_redir_inet_type;
static const struct nft_expr_ops nft_redir_inet_ops = {
.type = &nft_redir_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_inet_eval,
+ .eval = nft_redir_eval,
.init = nft_redir_init,
.destroy = nft_redir_inet_destroy,
.dump = nft_redir_dump,
@@ -236,7 +211,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = {
.name = "redir",
.ops = &nft_redir_inet_ops,
.policy = nft_redir_policy,
- .maxattr = NFTA_MASQ_MAX,
+ .maxattr = NFTA_REDIR_MAX,
.owner = THIS_MODULE,
};
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index f2addc844dd2..196a92c7ea09 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -18,14 +18,13 @@
#include <linux/icmpv6.h>
const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
- [NFTA_REJECT_TYPE] = { .type = NLA_U32 },
+ [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 },
};
EXPORT_SYMBOL_GPL(nft_reject_policy);
int nft_reject_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_LOCAL_IN) |
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 973fa31a9dd6..49020e67304a 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -61,8 +61,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
}
static int nft_reject_inet_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_LOCAL_IN) |
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
index 7865cd8b11bb..2558ce1505d9 100644
--- a/net/netfilter/nft_reject_netdev.c
+++ b/net/netfilter/nft_reject_netdev.c
@@ -145,8 +145,7 @@ out:
}
static int nft_reject_netdev_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
}
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 5990fdd7b3cc..dc50b9a5bd68 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr,
if (nft_pf(pkt) != NFPROTO_IPV4)
goto err;
- *dest = (__force u32)rt_nexthop((const struct rtable *)dst,
+ *dest = (__force u32)rt_nexthop(dst_rtable(dst),
ip_hdr(skb)->daddr);
break;
case NFT_RT_NEXTHOP6:
if (nft_pf(pkt) != NFPROTO_IPV6)
goto err;
- memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
+ memcpy(dest, rt6_nexthop(dst_rt6_info(dst),
&ipv6_hdr(skb)->daddr),
sizeof(struct in6_addr));
break;
@@ -104,7 +104,7 @@ err:
static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = {
[NFTA_RT_DREG] = { .type = NLA_U32 },
- [NFTA_RT_KEY] = { .type = NLA_U32 },
+ [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_rt_get_init(const struct nft_ctx *ctx,
@@ -160,12 +160,16 @@ nla_put_failure:
return -1;
}
-static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_rt *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
switch (priv->key) {
case NFT_RT_NEXTHOP4:
case NFT_RT_NEXTHOP6:
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 96081ac8d2b4..8d3f040a904a 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -13,6 +13,7 @@
#include <net/netfilter/nf_tables_core.h>
struct nft_bitmap_elem {
+ struct nft_elem_priv priv;
struct list_head head;
struct nft_set_ext ext;
};
@@ -74,26 +75,33 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
}
INDIRECT_CALLABLE_SCOPE
-bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
const struct nft_bitmap *priv = nft_set_priv(set);
+ static const struct nft_set_ext found;
u8 genmask = nft_genmask_cur(net);
u32 idx, off;
nft_bitmap_location(set, key, &idx, &off);
- return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+ if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+ return &found;
+
+ return NULL;
}
static struct nft_bitmap_elem *
-nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
+nft_bitmap_elem_find(const struct net *net,
+ const struct nft_set *set, struct nft_bitmap_elem *this,
u8 genmask)
{
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(net)->commit_mutex)) {
if (memcmp(nft_set_ext_key(&be->ext),
nft_set_ext_key(&this->ext), set->klen) ||
!nft_set_elem_active(&be->ext, genmask))
@@ -104,8 +112,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
return NULL;
}
-static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_bitmap_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
const struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -116,23 +125,23 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
!nft_set_elem_active(&be->ext, genmask))
continue;
- return be;
+ return &be->priv;
}
return ERR_PTR(-ENOENT);
}
static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *new = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
- be = nft_bitmap_elem_find(set, new, genmask);
+ be = nft_bitmap_elem_find(net, set, new, genmask);
if (be) {
- *ext = &be->ext;
+ *elem_priv = &be->priv;
return -EEXIST;
}
@@ -144,12 +153,11 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
return 0;
}
-static void nft_bitmap_remove(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static void nft_bitmap_remove(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
@@ -161,47 +169,46 @@ static void nft_bitmap_remove(const struct net *net,
static void nft_bitmap_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
/* Enter 11 state. */
priv->bitmap[idx] |= (genmask << off);
- nft_set_elem_change_active(net, set, &be->ext);
+ nft_clear(net, &be->ext);
}
-static bool nft_bitmap_flush(const struct net *net,
- const struct nft_set *set, void *_be)
+static void nft_bitmap_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
- struct nft_bitmap_elem *be = _be;
u32 idx, off;
nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
/* Enter 10 state, similar to deactivation. */
priv->bitmap[idx] &= ~(genmask << off);
nft_set_elem_change_active(net, set, &be->ext);
-
- return true;
}
-static void *nft_bitmap_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_bitmap_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be;
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *this = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
nft_bitmap_location(set, elem->key.val.data, &idx, &off);
- be = nft_bitmap_elem_find(set, this, genmask);
+ be = nft_bitmap_elem_find(net, set, this, genmask);
if (!be)
return NULL;
@@ -209,7 +216,7 @@ static void *nft_bitmap_deactivate(const struct net *net,
priv->bitmap[idx] &= ~(genmask << off);
nft_set_elem_change_active(net, set, &be->ext);
- return be;
+ return &be->priv;
}
static void nft_bitmap_walk(const struct nft_ctx *ctx,
@@ -218,17 +225,13 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
{
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- struct nft_set_elem elem;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&be->ext, iter->genmask))
- goto cont;
- elem.priv = be;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &be->priv);
if (iter->err < 0)
return;
@@ -265,19 +268,22 @@ static int nft_bitmap_init(const struct nft_set *set,
{
struct nft_bitmap *priv = nft_set_priv(set);
+ BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0);
+
INIT_LIST_HEAD(&priv->list);
priv->bitmap_size = nft_bitmap_size(set->klen);
return 0;
}
-static void nft_bitmap_destroy(const struct nft_set *set)
+static void nft_bitmap_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be, *n;
list_for_each_entry_safe(be, n, &priv->list, head)
- nft_set_elem_destroy(set, be, true);
+ nf_tables_set_elem_destroy(ctx, set, &be->priv);
}
static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 76de6c8d9865..ba01ce75d6de 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -24,10 +24,14 @@
struct nft_rhash {
struct rhashtable ht;
struct delayed_work gc_work;
+ u32 wq_gc_seq;
};
struct nft_rhash_elem {
+ struct nft_elem_priv priv;
struct rhash_head node;
+ struct llist_node walk_node;
+ u32 wq_gc_seq;
struct nft_set_ext ext;
};
@@ -35,6 +39,7 @@ struct nft_rhash_cmp_arg {
const struct nft_set *set;
const u32 *key;
u8 genmask;
+ u64 tstamp;
};
static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed)
@@ -59,7 +64,9 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1;
- if (nft_set_elem_expired(&he->ext))
+ if (nft_set_elem_is_dead(&he->ext))
+ return 1;
+ if (__nft_set_elem_expired(&he->ext, x->tstamp))
return 1;
if (!nft_set_elem_active(&he->ext, x->genmask))
return 1;
@@ -75,8 +82,9 @@ static const struct rhashtable_params nft_rhash_params = {
};
INDIRECT_CALLABLE_SCOPE
-bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rhash *priv = nft_set_priv(set);
const struct nft_rhash_elem *he;
@@ -84,17 +92,19 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
.genmask = nft_genmask_cur(net),
.set = set,
.key = key,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- *ext = &he->ext;
+ return &he->ext;
- return !!he;
+ return NULL;
}
-static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rhash_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he;
@@ -102,39 +112,40 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
.genmask = nft_genmask_cur(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- return he;
+ return &he->priv;
return ERR_PTR(-ENOENT);
}
-static bool nft_rhash_update(struct nft_set *set, const u32 *key,
- void *(*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *regs),
- const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+nft_rhash_update(struct nft_set *set, const u32 *key,
+ const struct nft_expr *expr, struct nft_regs *regs)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he, *prev;
+ struct nft_elem_priv *elem_priv;
struct nft_rhash_cmp_arg arg = {
.genmask = NFT_GENMASK_ANY,
.set = set,
.key = key,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
goto out;
- he = new(set, expr, regs);
- if (he == NULL)
+ elem_priv = nft_dynset_new(set, expr, regs);
+ if (!elem_priv)
goto err1;
+ he = nft_elem_priv_cast(elem_priv);
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -142,71 +153,67 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
/* Another cpu may race to insert the element with the same key */
if (prev) {
- nft_set_elem_destroy(set, he, true);
+ nft_set_elem_destroy(set, &he->priv, true);
atomic_dec(&set->nelems);
he = prev;
}
out:
- *ext = &he->ext;
- return true;
+ return &he->ext;
err2:
- nft_set_elem_destroy(set, he, true);
+ nft_set_elem_destroy(set, &he->priv, true);
atomic_dec(&set->nelems);
err1:
- return false;
+ return NULL;
}
static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he = elem->priv;
struct nft_rhash_cmp_arg arg = {
.genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = nft_net_tstamp(net),
};
struct nft_rhash_elem *prev;
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
return PTR_ERR(prev);
if (prev) {
- *ext = &prev->ext;
+ *elem_priv = &prev->priv;
return -EEXIST;
}
return 0;
}
static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rhash_elem *he = elem->priv;
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &he->ext);
- nft_set_elem_clear_busy(&he->ext);
+ nft_clear(net, &he->ext);
}
-static bool nft_rhash_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_rhash_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rhash_elem *he = priv;
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext)) {
- nft_set_elem_change_active(net, set, &he->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &he->ext);
}
-static void *nft_rhash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rhash_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he;
@@ -214,25 +221,25 @@ static void *nft_rhash_deactivate(const struct net *net,
.genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = nft_net_tstamp(net),
};
rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
- if (he != NULL &&
- !nft_rhash_flush(net, set, he))
- he = NULL;
+ if (he)
+ nft_set_elem_change_active(net, set, &he->ext);
rcu_read_unlock();
- return he;
+ return &he->priv;
}
static void nft_rhash_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he = elem->priv;
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
}
@@ -252,16 +259,17 @@ static bool nft_rhash_delete(const struct nft_set *set,
if (he == NULL)
return false;
- return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
+ nft_set_elem_dead(&he->ext);
+
+ return true;
}
-static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he;
struct rhashtable_iter hti;
- struct nft_set_elem elem;
+ struct nft_rhash_elem *he;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
@@ -278,14 +286,8 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&he->ext))
- goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
- elem.priv = he;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
if (iter->err < 0)
break;
@@ -296,6 +298,97 @@ cont:
rhashtable_walk_exit(&hti);
}
+static void nft_rhash_walk_update(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_elem *he, *tmp;
+ struct llist_node *first_node;
+ struct rhashtable_iter hti;
+ LLIST_HEAD(walk_list);
+
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (set->in_update_walk) {
+ /* This can happen with bogus rulesets during ruleset validation
+ * when a verdict map causes a jump back to the same map.
+ *
+ * Without this extra check the walk_next loop below will see
+ * elems on the callers walk_list and skip (not validate) them.
+ */
+ iter->err = -EMLINK;
+ return;
+ }
+
+ /* walk happens under RCU.
+ *
+ * We create a snapshot list so ->iter callback can sleep.
+ * commit_mutex is held, elements can ...
+ * .. be added in parallel from dataplane (dynset)
+ * .. be marked as dead in parallel from dataplane (dynset).
+ * .. be queued for removal in parallel (gc timeout).
+ * .. not be freed: transaction mutex is held.
+ */
+ rhashtable_walk_enter(&priv->ht, &hti);
+ rhashtable_walk_start(&hti);
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN) {
+ iter->err = PTR_ERR(he);
+ break;
+ }
+
+ continue;
+ }
+
+ /* rhashtable resized during walk, skip */
+ if (llist_on_list(&he->walk_node))
+ continue;
+
+ llist_add(&he->walk_node, &walk_list);
+ }
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ first_node = __llist_del_all(&walk_list);
+ set->in_update_walk = true;
+ llist_for_each_entry_safe(he, tmp, first_node, walk_node) {
+ if (iter->err == 0) {
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
+ if (iter->err == 0)
+ iter->count++;
+ }
+
+ /* all entries must be cleared again, else next ->walk iteration
+ * will skip entries.
+ */
+ init_llist_node(&he->walk_node);
+ }
+ set->in_update_walk = false;
+}
+
+static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ /* only relevant for netlink dumps which use READ type */
+ WARN_ON_ONCE(iter->skip != 0);
+
+ nft_rhash_walk_update(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ nft_rhash_walk_ro(ctx, set, iter);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
struct nft_set_ext *ext)
{
@@ -305,7 +398,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
nft_setelem_expr_foreach(expr, elem_expr, size) {
if (expr->ops->gc &&
- expr->ops->gc(read_pnet(&set->net), expr))
+ expr->ops->gc(read_pnet(&set->net), expr) &&
+ set->flags & NFT_SET_EVAL)
return true;
}
@@ -314,25 +408,60 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
static void nft_rhash_gc(struct work_struct *work)
{
+ struct nftables_pernet *nft_net;
struct nft_set *set;
struct nft_rhash_elem *he;
struct nft_rhash *priv;
- struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
+ struct nft_trans_gc *gc;
+ struct net *net;
+ u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ if (nft_set_gc_is_pending(set))
+ goto done;
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
+
+ /* Elements never collected use a zero gc worker sequence number. */
+ if (unlikely(++priv->wq_gc_seq == 0))
+ priv->wq_gc_seq++;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
- if (PTR_ERR(he) != -EAGAIN)
- break;
- continue;
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
+
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
}
+ /* rhashtable walk is unstable, already seen in this gc run?
+ * Then, skip this element. In case of (unlikely) sequence
+ * wraparound and stale element wq_gc_seq, next gc run will
+ * just find this expired element.
+ */
+ if (he->wq_gc_seq == priv->wq_gc_seq)
+ continue;
+
+ if (nft_set_elem_is_dead(&he->ext))
+ goto dead_elem;
+
if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
nft_rhash_expr_needs_gc_run(set, &he->ext))
goto needs_gc_run;
@@ -340,26 +469,28 @@ static void nft_rhash_gc(struct work_struct *work)
if (!nft_set_elem_expired(&he->ext))
continue;
needs_gc_run:
- if (nft_set_elem_mark_busy(&he->ext))
- continue;
-
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb == NULL)
- break;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, he);
+ nft_set_elem_dead(&he->ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
+
+ /* annotate gc sequence for this attempt. */
+ he->wq_gc_seq = priv->wq_gc_seq;
+ nft_trans_gc_elem_add(gc, he);
}
+
+ gc = nft_trans_gc_catchall_async(gc, gc_seq);
+
+try_later:
+ /* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
- he = nft_set_catchall_gc(set);
- if (he) {
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb)
- nft_set_gc_batch_add(gcb, he);
- }
- nft_set_gc_batch_complete(gcb);
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
@@ -386,6 +517,8 @@ static int nft_rhash_init(const struct nft_set *set,
struct rhashtable_params params = nft_rhash_params;
int err;
+ BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0);
+
params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT;
params.key_len = set->klen;
@@ -394,25 +527,37 @@ static int nft_rhash_init(const struct nft_set *set,
return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
- if (set->flags & NFT_SET_TIMEOUT)
+ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set);
return 0;
}
+struct nft_rhash_ctx {
+ const struct nft_ctx ctx;
+ const struct nft_set *set;
+};
+
static void nft_rhash_elem_destroy(void *ptr, void *arg)
{
- nft_set_elem_destroy(arg, ptr, true);
+ struct nft_rhash_ctx *rhash_ctx = arg;
+ struct nft_rhash_elem *he = ptr;
+
+ nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv);
}
-static void nft_rhash_destroy(const struct nft_set *set)
+static void nft_rhash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_ctx rhash_ctx = {
+ .ctx = *ctx,
+ .set = set,
+ };
cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
- (void *)set);
+ (void *)&rhash_ctx);
}
/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
@@ -445,13 +590,15 @@ struct nft_hash {
};
struct nft_hash_elem {
+ struct nft_elem_priv priv;
struct hlist_node node;
struct nft_set_ext ext;
};
INDIRECT_CALLABLE_SCOPE
-bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -462,16 +609,15 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
hash = reciprocal_scale(hash, priv->buckets);
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
-static void *nft_hash_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_hash_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -483,15 +629,15 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set,
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
nft_set_elem_active(&he->ext, genmask))
- return he;
+ return &he->priv;
}
return ERR_PTR(-ENOENT);
}
INDIRECT_CALLABLE_SCOPE
-bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -504,12 +650,10 @@ bool nft_hash_lookup_fast(const struct net *net,
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
k2 = *(u32 *)nft_set_ext_key(&he->ext)->data;
if (k1 == k2 &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
@@ -531,9 +675,9 @@ static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
static int nft_hash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
- struct nft_hash_elem *this = elem->priv, *he;
+ struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
u32 hash;
@@ -543,7 +687,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
if (!memcmp(nft_set_ext_key(&this->ext),
nft_set_ext_key(&he->ext), set->klen) &&
nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
+ *elem_priv = &he->priv;
return -EEXIST;
}
}
@@ -552,28 +696,28 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
}
static void nft_hash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &he->ext);
+ nft_clear(net, &he->ext);
}
-static bool nft_hash_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_hash_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
nft_set_elem_change_active(net, set, &he->ext);
- return true;
}
-static void *nft_hash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_hash_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *this = elem->priv, *he;
u8 genmask = nft_genmask_next(net);
u32 hash;
@@ -583,7 +727,7 @@ static void *nft_hash_deactivate(const struct net *net,
set->klen) &&
nft_set_elem_active(&he->ext, genmask)) {
nft_set_elem_change_active(net, set, &he->ext);
- return he;
+ return &he->priv;
}
}
return NULL;
@@ -591,9 +735,9 @@ static void *nft_hash_deactivate(const struct net *net,
static void nft_hash_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
hlist_del_rcu(&he->node);
}
@@ -603,19 +747,15 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
- struct nft_set_elem elem;
int i;
for (i = 0; i < priv->buckets; i++) {
- hlist_for_each_entry_rcu(he, &priv->table[i], node) {
+ hlist_for_each_entry_rcu(he, &priv->table[i], node,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
-
- elem.priv = he;
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
if (iter->err < 0)
return;
cont:
@@ -643,7 +783,8 @@ static int nft_hash_init(const struct nft_set *set,
return 0;
}
-static void nft_hash_destroy(const struct nft_set *set)
+static void nft_hash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
@@ -653,7 +794,7 @@ static void nft_hash_destroy(const struct nft_set *set)
for (i = 0; i < priv->buckets; i++) {
hlist_for_each_entry_safe(he, next, &priv->table[i], node) {
hlist_del_rcu(&he->node);
- nft_set_elem_destroy(set, he, true);
+ nf_tables_set_elem_destroy(ctx, set, &he->priv);
}
}
}
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 06d46d182634..112fe46788b6 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -342,9 +342,6 @@
#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
-/* Current working bitmap index, toggled between field matches */
-static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
-
/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
@@ -362,11 +359,13 @@ static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
*
* Return: -1 on no match, bit position on 'match_only', 0 otherwise.
*/
-int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
- union nft_pipapo_map_bucket *mt, bool match_only)
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+ unsigned long *dst,
+ const union nft_pipapo_map_bucket *mt, bool match_only)
{
unsigned long bitset;
- int k, ret = -1;
+ unsigned int k;
+ int ret = -1;
for (k = 0; k < len; k++) {
bitset = map[k];
@@ -398,41 +397,47 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
}
/**
- * nft_pipapo_lookup() - Lookup function
- * @net: Network namespace
- * @set: nftables API set representation
- * @key: nftables API element representation containing key data
- * @ext: nftables API extension pointer, filled with matching reference
+ * pipapo_get_slow() - Get matching element reference given key data
+ * @m: storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation.
*
- * Return: true on match, false otherwise.
+ * This is the main lookup function. It matches key data against either
+ * the working match set or the uncommitted copy, depending on what the
+ * caller passed to us.
+ * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup
+ * (datapath lookup) pass the active copy.
+ * The insertion path will pass the uncommitted working copy.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- unsigned long *res_map, *fill_map;
- u8 genmask = nft_genmask_cur(net);
- const u8 *rp = (const u8 *)key;
- struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
+ unsigned long *res_map, *fill_map, *map;
+ struct nft_pipapo_scratch *scratch;
+ const struct nft_pipapo_field *f;
bool map_index;
int i;
local_bh_disable();
- map_index = raw_cpu_read(nft_pipapo_scratch_index);
-
- m = rcu_dereference(priv->match);
-
- if (unlikely(!m || !*raw_cpu_ptr(m->scratch)))
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
goto out;
+ __local_lock_nested_bh(&scratch->bh_lock);
+
+ map_index = scratch->map_index;
- res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0);
- fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max);
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res_map = map + (map_index ? m->bsize_max : 0);
+ fill_map = map + (map_index ? 0 : m->bsize_max);
- memset(res_map, 0xff, m->bsize_max * sizeof(*res_map));
+ pipapo_resmap_init(m, res_map);
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
@@ -442,12 +447,12 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
* packet bytes value, then AND bucket value
*/
if (likely(f->bb == 8))
- pipapo_and_field_buckets_8bit(f, res_map, rp);
+ pipapo_and_field_buckets_8bit(f, res_map, data);
else
- pipapo_and_field_buckets_4bit(f, res_map, rp);
+ pipapo_and_field_buckets_4bit(f, res_map, data);
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
- rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -460,16 +465,19 @@ next_match:
b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
last);
if (b < 0) {
- raw_cpu_write(nft_pipapo_scratch_index, map_index);
+ scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
- return false;
+ return NULL;
}
if (last) {
- *ext = &f->mt[b].e->ext;
- if (unlikely(nft_set_elem_expired(*ext) ||
- !nft_set_elem_active(*ext, genmask)))
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[b].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask)))
goto next_match;
/* Last field: we're just returning the key without
@@ -477,10 +485,10 @@ next_match:
* current inactive bitmap is clean and can be reused as
* *next* bitmap (not initial) for the next packet.
*/
- raw_cpu_write(nft_pipapo_scratch_index, map_index);
+ scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
-
- return true;
+ return e;
}
/* Swap bitmap indices: res_map is the initial bitmap for the
@@ -490,119 +498,202 @@ next_match:
map_index = !map_index;
swap(res_map, fill_map);
- rp += NFT_PIPAPO_GROUPS_PADDING(f);
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
}
+ __local_unlock_nested_bh(&scratch->bh_lock);
out:
local_bh_enable();
- return false;
+ return NULL;
}
/**
* pipapo_get() - Get matching element reference given key data
- * @net: Network namespace
- * @set: nftables API set representation
+ * @m: Storage containing the set elements
* @data: Key data to be matched against existing elements
* @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
*
- * This is essentially the same as the lookup function, except that it matches
- * key data against the uncommitted copy and doesn't use preallocated maps for
- * bitmap results.
+ * This is a dispatcher function, either calling out the generic C
+ * implementation or, if available, the AVX2 one.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
*
- * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise.
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-static struct nft_pipapo_elem *pipapo_get(const struct net *net,
- const struct nft_set *set,
- const u8 *data, u8 genmask)
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m = priv->clone;
- unsigned long *res_map, *fill_map = NULL;
- struct nft_pipapo_field *f;
- int i;
+ struct nft_pipapo_elem *e;
- res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
- if (!res_map) {
- ret = ERR_PTR(-ENOMEM);
- goto out;
- }
+ local_bh_disable();
- fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
- if (!fill_map) {
- ret = ERR_PTR(-ENOMEM);
- goto out;
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
+ e = pipapo_get_avx2(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
}
+#endif
+ e = pipapo_get_slow(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+}
- memset(res_map, 0xff, m->bsize_max * sizeof(*res_map));
+/**
+ * nft_pipapo_lookup() - Dataplane frontend for main lookup function
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: pointer to nft registers containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy.
+ * Unlike other set types, this uses 0 instead of nft_genmask_cur().
+ *
+ * This is because new (future) elements are not reachable from
+ * priv->match, they get added to priv->clone instead.
+ * When the commit phase flips the generation bitmask, the
+ * 'now old' entries are skipped but without the 'now current'
+ * elements becoming visible. Using nft_genmask_cur() thus creates
+ * inconsistent state: matching old entries get skipped but the
+ * newly matching entries are unreachable.
+ *
+ * GENMASK_ANY doesn't work for the same reason: old-gen entries get
+ * skipped, new-gen entries are only reachable from priv->clone.
+ *
+ * nft_pipapo_commit swaps ->clone and ->match shortly after the
+ * genbit flip. As ->clone doesn't contain the old entries in the first
+ * place, lookup will only find the now-current ones.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const struct nft_pipapo_elem *e;
- nft_pipapo_for_each_field(f, i, m) {
- bool last = i == m->field_count - 1;
- int b;
+ m = rcu_dereference(priv->match);
+ e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64());
- /* For each bit group: select lookup table bucket depending on
- * packet bytes value, then AND bucket value
- */
- if (f->bb == 8)
- pipapo_and_field_buckets_8bit(f, res_map, data);
- else if (f->bb == 4)
- pipapo_and_field_buckets_4bit(f, res_map, data);
- else
- BUG();
+ return e ? &e->ext : NULL;
+}
- data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
+/**
+ * nft_pipapo_get() - Get matching element reference given key data
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @flags: Unused
+ *
+ * This function is called from the control plane path under
+ * RCU read lock.
+ *
+ * Return: set element private pointer or ERR_PTR(-ENOENT).
+ */
+static struct nft_elem_priv *
+nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = rcu_dereference(priv->match);
+ struct nft_pipapo_elem *e;
- /* Now populate the bitmap for the next field, unless this is
- * the last field, in which case return the matched 'ext'
- * pointer if any.
- *
- * Now res_map contains the matching bitmap, and fill_map is the
- * bitmap for the next field.
- */
-next_match:
- b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
- last);
- if (b < 0)
- goto out;
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_cur(net), get_jiffies_64());
+ if (!e)
+ return ERR_PTR(-ENOENT);
- if (last) {
- if (nft_set_elem_expired(&f->mt[b].e->ext) ||
- (genmask &&
- !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
- goto next_match;
+ return &e->priv;
+}
- ret = f->mt[b].e;
- goto out;
- }
+/**
+ * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
+ * @f: Field containing mapping table
+ * @old_rules: Amount of existing mapped rules
+ * @rules: Amount of new rules to map
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int pipapo_realloc_mt(struct nft_pipapo_field *f,
+ unsigned int old_rules, unsigned int rules)
+{
+ union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
+ const unsigned int extra = PAGE_SIZE / sizeof(*new_mt);
+ unsigned int rules_alloc = rules;
- data += NFT_PIPAPO_GROUPS_PADDING(f);
+ might_sleep();
- /* Swap bitmap indices: fill_map will be the initial bitmap for
- * the next field (i.e. the new res_map), and res_map is
- * guaranteed to be all-zeroes at this point, ready to be filled
- * according to the next mapping table.
- */
- swap(res_map, fill_map);
+ if (unlikely(rules == 0))
+ goto out_free;
+
+ /* growing and enough space left, no action needed */
+ if (rules > old_rules && f->rules_alloc > rules)
+ return 0;
+
+ /* downsize and extra slack has not grown too large */
+ if (rules < old_rules) {
+ unsigned int remove = f->rules_alloc - rules;
+
+ if (remove < (2u * extra))
+ return 0;
}
-out:
- kfree(fill_map);
- kfree(res_map);
- return ret;
+ /* If set needs more than one page of memory for rules then
+ * allocate another extra page to avoid frequent reallocation.
+ */
+ if (rules > extra &&
+ check_add_overflow(rules, extra, &rules_alloc))
+ return -EOVERFLOW;
+
+ if (rules_alloc > (INT_MAX / sizeof(*new_mt)))
+ return -ENOMEM;
+
+ new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT);
+ if (!new_mt)
+ return -ENOMEM;
+
+ if (old_mt)
+ memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
+
+ if (rules > old_rules) {
+ memset(new_mt + old_rules, 0,
+ (rules - old_rules) * sizeof(*new_mt));
+ }
+out_free:
+ f->rules_alloc = rules_alloc;
+ f->mt = new_mt;
+
+ kvfree(old_mt);
+
+ return 0;
}
+
/**
- * nft_pipapo_get() - Get matching element reference given key data
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- * @flags: Unused
+ * lt_calculate_size() - Get storage size for lookup table with overflow check
+ * @groups: Amount of bit groups
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @bsize: Size of each bucket in lookup table, in longs
+ *
+ * Return: allocation size including alignment overhead, negative on overflow
*/
-static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb,
+ unsigned int bsize)
{
- return pipapo_get(net, set, (const u8 *)elem->key.val.data,
- nft_genmask_cur(net));
+ ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long);
+
+ if (check_mul_overflow(ret, bsize, &ret))
+ return -1;
+ if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret))
+ return -1;
+ if (ret > INT_MAX)
+ return -1;
+
+ return ret;
}
/**
@@ -617,12 +708,16 @@ static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
*
* Return: 0 on success, -ENOMEM on allocation failure.
*/
-static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
+static int pipapo_resize(struct nft_pipapo_field *f,
+ unsigned int old_rules, unsigned int rules)
{
long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
- union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
- size_t new_bucket_size, copy;
- int group, bucket;
+ unsigned int new_bucket_size, copy;
+ int group, bucket, err;
+ ssize_t lt_size;
+
+ if (rules >= NFT_PIPAPO_RULE0_MAX)
+ return -ENOSPC;
new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
#ifdef NFT_PIPAPO_ALIGN
@@ -638,10 +733,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
else
copy = new_bucket_size;
- new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) *
- new_bucket_size * sizeof(*new_lt) +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL);
+ lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size);
+ if (lt_size < 0)
+ return -ENOMEM;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return -ENOMEM;
@@ -662,27 +758,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
}
mt:
- new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
- if (!new_mt) {
+ err = pipapo_realloc_mt(f, old_rules, rules);
+ if (err) {
kvfree(new_lt);
- return -ENOMEM;
- }
-
- memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
- if (rules > old_rules) {
- memset(new_mt + old_rules, 0,
- (rules - old_rules) * sizeof(*new_mt));
+ return err;
}
if (new_lt) {
f->bsize = new_bucket_size;
- NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+ f->lt = new_lt;
kvfree(old_lt);
}
- f->mt = new_mt;
- kvfree(old_mt);
-
return 0;
}
@@ -833,9 +920,9 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
*/
static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
{
+ unsigned int groups, bb;
unsigned long *new_lt;
- int groups, bb;
- size_t lt_size;
+ ssize_t lt_size;
lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
sizeof(*f->lt);
@@ -845,15 +932,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
groups = f->groups * 2;
bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
- lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
- sizeof(*f->lt);
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
} else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
groups = f->groups / 2;
bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
- lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
- sizeof(*f->lt);
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
/* Don't increase group width if the resulting lookup table size
* would exceed the upper size threshold for a "small" set.
@@ -864,7 +953,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
return;
}
- new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return;
@@ -884,7 +973,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
f->groups = groups;
f->bb = bb;
kvfree(f->lt);
- NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+ f->lt = new_lt;
}
/**
@@ -901,12 +990,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int mask_bits)
{
- int rule = f->rules++, group, ret, bit_offset = 0;
+ unsigned int rule = f->rules, group, ret, bit_offset = 0;
- ret = pipapo_resize(f, f->rules - 1, f->rules);
+ ret = pipapo_resize(f, f->rules, f->rules + 1);
if (ret)
return ret;
+ f->rules++;
+
for (group = 0; group < f->groups; group++) {
int i, v;
u8 mask;
@@ -1051,7 +1142,9 @@ static int pipapo_expand(struct nft_pipapo_field *f,
step++;
if (step >= len) {
if (!masks) {
- pipapo_insert(f, base, 0);
+ err = pipapo_insert(f, base, 0);
+ if (err < 0)
+ return err;
masks = 1;
}
goto out;
@@ -1097,6 +1190,20 @@ static void pipapo_map(struct nft_pipapo_match *m,
}
/**
+ * pipapo_free_scratch() - Free per-CPU map at original address
+ * @m: Matching data
+ * @cpu: CPU number
+ */
+static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
+{
+ struct nft_pipapo_scratch *s;
+
+ s = *per_cpu_ptr(m->scratch, cpu);
+
+ kvfree(s);
+}
+
+/**
* pipapo_realloc_scratch() - Reallocate scratch maps for partial match results
* @clone: Copy of matching data with pending insertions and deletions
* @bsize_max: Maximum bucket size, scratch maps cover two buckets
@@ -1109,14 +1216,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
int i;
for_each_possible_cpu(i) {
- unsigned long *scratch;
-#ifdef NFT_PIPAPO_ALIGN
- unsigned long *scratch_aligned;
-#endif
+ struct nft_pipapo_scratch *scratch;
- scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL, cpu_to_node(i));
+ scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL_ACCOUNT, cpu_to_node(i));
if (!scratch) {
/* On failure, there's no need to undo previous
* allocations: this means that some scratch maps have
@@ -1128,50 +1232,82 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
return -ENOMEM;
}
- kfree(*per_cpu_ptr(clone->scratch, i));
-
+ pipapo_free_scratch(clone, i);
+ local_lock_init(&scratch->bh_lock);
*per_cpu_ptr(clone->scratch, i) = scratch;
-
-#ifdef NFT_PIPAPO_ALIGN
- scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch);
- *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned;
-#endif
}
return 0;
}
+static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ const struct net *net = read_pnet(&set->net);
+
+ return lockdep_is_held(&nft_pernet(net)->commit_mutex);
+#else
+ return true;
+#endif
+}
+
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old);
+
+/**
+ * pipapo_maybe_clone() - Build clone for pending data changes, if not existing
+ * @set: nftables API set representation
+ *
+ * Return: newly created or existing clone, if any. NULL on allocation failure
+ */
+static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+
+ if (priv->clone)
+ return priv->clone;
+
+ m = rcu_dereference_protected(priv->match,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = pipapo_clone(m);
+
+ return priv->clone;
+}
+
/**
* nft_pipapo_insert() - Validate and insert ranged elements
* @net: Network namespace
* @set: nftables API set representation
* @elem: nftables API element representation containing key data
- * @ext2: Filled with pointer to &struct nft_set_ext in inserted element
+ * @elem_priv: Filled with pointer to &struct nft_elem_priv in inserted element
*
* Return: 0 on success, error pointer on failure.
*/
static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext2)
+ struct nft_elem_priv **elem_priv)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
const u8 *start = (const u8 *)elem->key.val.data, *end;
- struct nft_pipapo_elem *e = elem->priv, *dup;
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m = priv->clone;
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
u8 genmask = nft_genmask_next(net);
+ struct nft_pipapo_elem *e, *dup;
+ u64 tstamp = nft_net_tstamp(net);
struct nft_pipapo_field *f;
const u8 *start_p, *end_p;
int i, bsize_max, err = 0;
+ if (!m)
+ return -ENOMEM;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
end = (const u8 *)nft_set_ext_key_end(ext)->data;
else
end = start;
- dup = pipapo_get(net, set, start, genmask);
- if (!IS_ERR(dup)) {
+ dup = pipapo_get(m, start, genmask, tstamp);
+ if (dup) {
/* Check if we already have the same exact entry */
const struct nft_data *dup_key, *dup_end;
@@ -1183,30 +1319,31 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
!memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
- *ext2 = &dup->ext;
+ *elem_priv = &dup->priv;
return -EEXIST;
}
return -ENOTEMPTY;
}
- if (PTR_ERR(dup) == -ENOENT) {
- /* Look for partially overlapping entries */
- dup = pipapo_get(net, set, end, nft_genmask_next(net));
- }
-
- if (PTR_ERR(dup) != -ENOENT) {
- if (IS_ERR(dup))
- return PTR_ERR(dup);
- *ext2 = &dup->ext;
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(m, end, nft_genmask_next(net), tstamp);
+ if (dup) {
+ *elem_priv = &dup->priv;
return -ENOTEMPTY;
}
/* Validate */
start_p = start;
end_p = end;
+
+ /* some helpers return -1, or >= 0 for valid rule pos,
+ * so we cannot support more than INT_MAX rules at this time.
+ */
+ BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX);
+
nft_pipapo_for_each_field(f, i, m) {
- if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
+ if (f->rules >= NFT_PIPAPO_RULE0_MAX)
return -ENOSPC;
if (memcmp(start_p, end_p,
@@ -1218,8 +1355,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
}
/* Insert */
- priv->dirty = true;
-
bsize_max = m->bsize_max;
nft_pipapo_for_each_field(f, i, m) {
@@ -1234,6 +1369,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
else
ret = pipapo_expand(f, start, end, f->groups * f->bb);
+ if (ret < 0)
+ return ret;
+
if (f->bsize > bsize_max)
bsize_max = f->bsize;
@@ -1255,7 +1393,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
put_cpu_ptr(m->scratch);
}
- *ext2 = &e->ext;
+ e = nft_elem_priv_cast(elem->priv);
+ *elem_priv = &e->priv;
pipapo_map(m, rulemap, e);
@@ -1266,7 +1405,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
* pipapo_clone() - Clone matching data to create new working copy
* @old: Existing matching data
*
- * Return: copy of matching data passed as 'old', error pointer on failure
+ * Return: copy of matching data passed as 'old' or NULL.
*/
static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
{
@@ -1274,10 +1413,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
struct nft_pipapo_match *new;
int i;
- new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count,
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT);
if (!new)
- return ERR_PTR(-ENOMEM);
+ return NULL;
new->field_count = old->field_count;
new->bsize_max = old->bsize_max;
@@ -1286,11 +1424,6 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (!new->scratch)
goto out_scratch;
-#ifdef NFT_PIPAPO_ALIGN
- new->scratch_aligned = alloc_percpu(*new->scratch_aligned);
- if (!new->scratch_aligned)
- goto out_scratch;
-#endif
for_each_possible_cpu(i)
*per_cpu_ptr(new->scratch, i) = NULL;
@@ -1304,28 +1437,41 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
for (i = 0; i < old->field_count; i++) {
unsigned long *new_lt;
+ ssize_t lt_size;
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
- new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
- src->bsize * sizeof(*dst->lt) +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL);
+ lt_size = lt_calculate_size(src->groups, src->bb, src->bsize);
+ if (lt_size < 0)
+ goto out_lt;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
goto out_lt;
- NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
+ dst->lt = new_lt;
memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
NFT_PIPAPO_LT_ALIGN(src->lt),
src->bsize * sizeof(*dst->lt) *
src->groups * NFT_PIPAPO_BUCKETS(src->bb));
- dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
- if (!dst->mt)
- goto out_mt;
+ if (src->rules > 0) {
+ if (src->rules_alloc > (INT_MAX / sizeof(*src->mt)))
+ goto out_mt;
+
+ dst->mt = kvmalloc_array(src->rules_alloc,
+ sizeof(*src->mt),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst->mt)
+ goto out_mt;
+
+ memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+ } else {
+ dst->mt = NULL;
+ dst->rules_alloc = 0;
+ }
- memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
src++;
dst++;
}
@@ -1342,15 +1488,12 @@ out_lt:
}
out_scratch_realloc:
for_each_possible_cpu(i)
- kfree(*per_cpu_ptr(new->scratch, i));
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(new->scratch_aligned);
-#endif
+ pipapo_free_scratch(new, i);
out_scratch:
free_percpu(new->scratch);
kfree(new);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
/**
@@ -1382,10 +1525,10 @@ out_scratch:
*
* Return: Number of rules that originated from the same entry as @first.
*/
-static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
+static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first)
{
struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */
- int r;
+ unsigned int r;
for (r = first; r < f->rules; r++) {
if (r != first && e != f->mt[r].e)
@@ -1438,8 +1581,9 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
* 0 1 2
* element pointers: 0x42 0x42 0x44
*/
-static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
- int start, int n, int to_offset, bool is_last)
+static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules,
+ unsigned int start, unsigned int n,
+ unsigned int to_offset, bool is_last)
{
int i;
@@ -1529,21 +1673,35 @@ static void pipapo_drop(struct nft_pipapo_match *m,
}
}
+static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
+ struct nft_pipapo_elem *e)
+
+{
+ nft_setelem_data_deactivate(net, set, &e->priv);
+}
+
/**
* pipapo_gc() - Drop expired entries from set, destroy start and end elements
* @set: nftables API set representation
* @m: Matching data
*/
-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
{
struct nft_pipapo *priv = nft_set_priv(set);
- int rules_f0, first_rule = 0;
+ struct net *net = read_pnet(&set->net);
+ unsigned int rules_f0, first_rule = 0;
+ u64 tstamp = nft_net_tstamp(net);
struct nft_pipapo_elem *e;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
- struct nft_pipapo_field *f;
- int i, start, rules_fx;
+ const struct nft_pipapo_field *f;
+ unsigned int i, start, rules_fx;
start = first_rule;
rules_fx = rules_f0;
@@ -1562,13 +1720,18 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
f--;
i--;
e = f->mt[rulemap[i].to].e;
- if (nft_set_elem_expired(&e->ext) &&
- !nft_set_elem_mark_busy(&e->ext)) {
- priv->dirty = true;
- pipapo_drop(m, rulemap);
- rcu_barrier();
- nft_set_elem_destroy(set, e, true);
+ /* synchronous gc never fails, there is no need to set
+ * NFT_SET_ELEM_DEAD_BIT.
+ */
+ if (__nft_set_elem_expired(&e->ext, tstamp)) {
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ nft_pipapo_gc_deactivate(net, set, e);
+ pipapo_drop(m, rulemap);
+ nft_trans_gc_elem_add(gc, e);
/* And check again current first rule, which is now the
* first we haven't checked.
@@ -1578,11 +1741,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
}
}
- e = nft_set_catchall_gc(set);
- if (e)
- nft_set_elem_destroy(set, e, true);
-
- priv->last_gc = jiffies;
+ gc = nft_trans_gc_catchall_sync(gc);
+ if (gc) {
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
/**
@@ -1600,32 +1763,33 @@ static void pipapo_free_fields(struct nft_pipapo_match *m)
}
}
-/**
- * pipapo_reclaim_match - RCU callback to free fields from old matching data
- * @rcu: RCU head
- */
-static void pipapo_reclaim_match(struct rcu_head *rcu)
+static void pipapo_free_match(struct nft_pipapo_match *m)
{
- struct nft_pipapo_match *m;
int i;
- m = container_of(rcu, struct nft_pipapo_match, rcu);
-
for_each_possible_cpu(i)
- kfree(*per_cpu_ptr(m->scratch, i));
+ pipapo_free_scratch(m, i);
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(m->scratch_aligned);
-#endif
free_percpu(m->scratch);
-
pipapo_free_fields(m);
kfree(m);
}
/**
- * pipapo_commit() - Replace lookup data with current working copy
+ * pipapo_reclaim_match - RCU callback to free fields from old matching data
+ * @rcu: RCU head
+ */
+static void pipapo_reclaim_match(struct rcu_head *rcu)
+{
+ struct nft_pipapo_match *m;
+
+ m = container_of(rcu, struct nft_pipapo_match, rcu);
+ pipapo_free_match(m);
+}
+
+/**
+ * nft_pipapo_commit() - Replace lookup data with current working copy
* @set: nftables API set representation
*
* While at it, check if we should perform garbage collection on the working
@@ -1635,108 +1799,91 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
* We also need to create a new working copy for subsequent insertions and
* deletions.
*/
-static void pipapo_commit(const struct nft_set *set)
+static void nft_pipapo_commit(struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *new_clone, *old;
+ struct nft_pipapo_match *old;
- if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
- pipapo_gc(set, priv->clone);
-
- if (!priv->dirty)
+ if (!priv->clone)
return;
- new_clone = pipapo_clone(priv->clone);
- if (IS_ERR(new_clone))
- return;
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ pipapo_gc(set, priv->clone);
- priv->dirty = false;
+ old = rcu_replace_pointer(priv->match, priv->clone,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = NULL;
- old = rcu_access_pointer(priv->match);
- rcu_assign_pointer(priv->match, priv->clone);
if (old)
call_rcu(&old->rcu, pipapo_reclaim_match);
+}
+
+static void nft_pipapo_abort(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
- priv->clone = new_clone;
+ if (!priv->clone)
+ return;
+ pipapo_free_match(priv->clone);
+ priv->clone = NULL;
}
/**
* nft_pipapo_activate() - Mark element reference as active given key, commit
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @elem_priv: nftables API element representation containing key data
*
* On insertion, elements are added to a copy of the matching data currently
- * in use for lookups, and not directly inserted into current lookup data, so
- * we'll take care of that by calling pipapo_commit() here. Both
+ * in use for lookups, and not directly inserted into current lookup data. Both
* nft_pipapo_insert() and nft_pipapo_activate() are called once for each
* element, hence we can't purpose either one as a real commit operation.
*/
static void nft_pipapo_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_pipapo_elem *e;
-
- e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0);
- if (IS_ERR(e))
- return;
+ struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &e->ext);
- nft_set_elem_clear_busy(&e->ext);
-
- pipapo_commit(set);
+ nft_clear(net, &e->ext);
}
/**
- * pipapo_deactivate() - Check that element is in set, mark as inactive
+ * nft_pipapo_deactivate() - Search for element and make it inactive
* @net: Network namespace
* @set: nftables API set representation
- * @data: Input key data
- * @ext: nftables API extension pointer, used to check for end element
- *
- * This is a convenience function that can be called from both
- * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same
- * operation.
+ * @elem: nftables API element representation containing key data
*
* Return: deactivated element if found, NULL otherwise.
*/
-static void *pipapo_deactivate(const struct net *net, const struct nft_set *set,
- const u8 *data, const struct nft_set_ext *ext)
+static struct nft_elem_priv *
+nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
struct nft_pipapo_elem *e;
- e = pipapo_get(net, set, data, nft_genmask_next(net));
- if (IS_ERR(e))
+ /* removal must occur on priv->clone, if we are low on memory
+ * we have no choice and must fail the removal request.
+ */
+ if (!m)
return NULL;
- nft_set_elem_change_active(net, set, &e->ext);
-
- return e;
-}
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_next(net), nft_net_tstamp(net));
+ if (!e)
+ return NULL;
-/**
- * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- *
- * Return: deactivated element if found, NULL otherwise.
- */
-static void *nft_pipapo_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ nft_set_elem_change_active(net, set, &e->ext);
- return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext);
+ return &e->priv;
}
/**
- * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive
+ * nft_pipapo_flush() - make element inactive
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @elem_priv: nftables API element representation containing key data
*
* This is functionally the same as nft_pipapo_deactivate(), with a slightly
* different interface, and it's also called once for each element in a set
@@ -1750,13 +1897,12 @@ static void *nft_pipapo_deactivate(const struct net *net,
*
* Return: true if element was found and deactivated.
*/
-static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
- void *elem)
+static void nft_pipapo_flush(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_pipapo_elem *e = elem;
+ struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
- return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext),
- &e->ext);
+ nft_set_elem_change_active(net, set, &e->ext);
}
/**
@@ -1879,7 +2025,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
* nft_pipapo_remove() - Remove element given key, commit
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @elem_priv: nftables API element representation containing key data
*
* Similarly to nft_pipapo_activate(), this is used as commit operation by the
* API, but it's called once per element in the pending transaction, so we can't
@@ -1887,20 +2033,17 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
* the matched element here, if any, and commit the updated matching data.
*/
static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m = priv->clone;
- struct nft_pipapo_elem *e = elem->priv;
- int rules_f0, first_rule = 0;
+ unsigned int rules_f0, first_rule = 0;
+ struct nft_pipapo_elem *e;
const u8 *data;
+ e = nft_elem_priv_cast(elem_priv);
data = (const u8 *)nft_set_ext_key(&e->ext);
- e = pipapo_get(net, set, data, 0);
- if (IS_ERR(e))
- return;
-
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
const u8 *match_start, *match_end;
@@ -1908,12 +2051,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
int i, start, rules_fx;
match_start = data;
- match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+
+ if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END))
+ match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+ else
+ match_end = data;
start = first_rule;
rules_fx = rules_f0;
nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1;
+
if (!pipapo_match_field(f, start, rules_fx,
match_start, match_end))
break;
@@ -1926,49 +2075,42 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
- }
- if (i == m->field_count) {
- priv->dirty = true;
- pipapo_drop(m, rulemap);
- pipapo_commit(set);
- return;
+ if (last && f->mt[rulemap[i].to].e == e) {
+ pipapo_drop(m, rulemap);
+ return;
+ }
}
first_rule += rules_f0;
}
+
+ WARN_ON_ONCE(1); /* elem_priv not found */
}
/**
- * nft_pipapo_walk() - Walk over elements
+ * nft_pipapo_do_walk() - Walk over elements in m
* @ctx: nftables API context
* @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
* @iter: Iterator
*
* As elements are referenced in the mapping array for the last field, directly
* scan that array: there's no need to follow rule mappings from the first
- * field.
+ * field. @m is protected either by RCU read lock or by transaction mutex.
*/
-static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_pipapo_match *m,
+ struct nft_set_iter *iter)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
- int i, r;
-
- rcu_read_lock();
- m = rcu_dereference(priv->match);
-
- if (unlikely(!m))
- goto out;
+ const struct nft_pipapo_field *f;
+ unsigned int i, r;
for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
;
for (r = 0; r < f->rules; r++) {
struct nft_pipapo_elem *e;
- struct nft_set_elem elem;
if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
continue;
@@ -1977,21 +2119,52 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
goto cont;
e = f->mt[r].e;
- if (nft_set_elem_expired(&e->ext))
- goto cont;
- elem.priv = e;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &e->priv);
if (iter->err < 0)
- goto out;
+ return;
cont:
iter->count++;
}
+}
-out:
- rcu_read_unlock();
+/**
+ * nft_pipapo_walk() - Walk over elements
+ * @ctx: nftables API context
+ * @set: nftables API set representation
+ * @iter: Iterator
+ *
+ * Test if destructive action is needed or not, clone active backend if needed
+ * and call the real function to work on the data.
+ */
+static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ m = pipapo_maybe_clone(set);
+ if (!m) {
+ iter->err = -ENOMEM;
+ return;
+ }
+
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ break;
+ case NFT_ITER_READ:
+ rcu_read_lock();
+ m = rcu_dereference(priv->match);
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ rcu_read_unlock();
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
}
/**
@@ -2054,20 +2227,24 @@ static int nft_pipapo_init(const struct nft_set *set,
struct nft_pipapo_field *f;
int err, i, field_count;
+ BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0);
+
field_count = desc->field_count ? : 1;
+ BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255);
+ BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT);
+
if (field_count > NFT_PIPAPO_MAX_FIELDS)
return -EINVAL;
- m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count,
- GFP_KERNEL);
+ m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL);
if (!m)
return -ENOMEM;
m->field_count = field_count;
m->bsize_max = 0;
- m->scratch = alloc_percpu(unsigned long *);
+ m->scratch = alloc_percpu(struct nft_pipapo_scratch *);
if (!m->scratch) {
err = -ENOMEM;
goto out_scratch;
@@ -2075,20 +2252,14 @@ static int nft_pipapo_init(const struct nft_set *set,
for_each_possible_cpu(i)
*per_cpu_ptr(m->scratch, i) = NULL;
-#ifdef NFT_PIPAPO_ALIGN
- m->scratch_aligned = alloc_percpu(unsigned long *);
- if (!m->scratch_aligned) {
- err = -ENOMEM;
- goto out_free;
- }
- for_each_possible_cpu(i)
- *per_cpu_ptr(m->scratch_aligned, i) = NULL;
-#endif
-
rcu_head_init(&m->rcu);
nft_pipapo_for_each_field(f, i, m) {
- int len = desc->field_len[i] ? : set->klen;
+ unsigned int len = desc->field_len[i] ? : set->klen;
+
+ /* f->groups is u8 */
+ BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES *
+ BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256);
f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
@@ -2097,28 +2268,15 @@ static int nft_pipapo_init(const struct nft_set *set,
f->bsize = 0;
f->rules = 0;
- NFT_PIPAPO_LT_ASSIGN(f, NULL);
+ f->rules_alloc = 0;
+ f->lt = NULL;
f->mt = NULL;
}
- /* Create an initial clone of matching data for next insertion */
- priv->clone = pipapo_clone(m);
- if (IS_ERR(priv->clone)) {
- err = PTR_ERR(priv->clone);
- goto out_free;
- }
-
- priv->dirty = false;
-
rcu_assign_pointer(priv->match, m);
return 0;
-out_free:
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(m->scratch_aligned);
-#endif
- free_percpu(m->scratch);
out_scratch:
kfree(m);
@@ -2127,14 +2285,16 @@ out_scratch:
/**
* nft_set_pipapo_match_destroy() - Destroy elements from key mapping array
+ * @ctx: context
* @set: nftables API set representation
* @m: matching data pointing to key mapping array
*/
-static void nft_set_pipapo_match_destroy(const struct nft_set *set,
+static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
struct nft_pipapo_match *m)
{
struct nft_pipapo_field *f;
- int i, r;
+ unsigned int i, r;
for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
;
@@ -2147,54 +2307,32 @@ static void nft_set_pipapo_match_destroy(const struct nft_set *set,
e = f->mt[r].e;
- nft_set_elem_destroy(set, e, true);
+ nf_tables_set_elem_destroy(ctx, set, &e->priv);
}
}
/**
* nft_pipapo_destroy() - Free private data for set and all committed elements
+ * @ctx: context
* @set: nftables API set representation
*/
-static void nft_pipapo_destroy(const struct nft_set *set)
+static void nft_pipapo_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
- int cpu;
m = rcu_dereference_protected(priv->match, true);
- if (m) {
- rcu_barrier();
-
- nft_set_pipapo_match_destroy(set, m);
-
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(m->scratch_aligned);
-#endif
- for_each_possible_cpu(cpu)
- kfree(*per_cpu_ptr(m->scratch, cpu));
- free_percpu(m->scratch);
- pipapo_free_fields(m);
- kfree(m);
- priv->match = NULL;
- }
if (priv->clone) {
- m = priv->clone;
-
- if (priv->dirty)
- nft_set_pipapo_match_destroy(set, m);
-
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(priv->clone->scratch_aligned);
-#endif
- for_each_possible_cpu(cpu)
- kfree(*per_cpu_ptr(priv->clone->scratch, cpu));
- free_percpu(priv->clone->scratch);
-
- pipapo_free_fields(priv->clone);
- kfree(priv->clone);
+ nft_set_pipapo_match_destroy(ctx, set, priv->clone);
+ pipapo_free_match(priv->clone);
priv->clone = NULL;
+ } else {
+ nft_set_pipapo_match_destroy(ctx, set, m);
}
+
+ pipapo_free_match(m);
}
/**
@@ -2230,6 +2368,8 @@ const struct nft_set_type nft_set_pipapo_type = {
.init = nft_pipapo_init,
.destroy = nft_pipapo_destroy,
.gc_init = nft_pipapo_gc_init,
+ .commit = nft_pipapo_commit,
+ .abort = nft_pipapo_abort,
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
@@ -2252,6 +2392,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = {
.init = nft_pipapo_init,
.destroy = nft_pipapo_destroy,
.gc_init = nft_pipapo_gc_init,
+ .commit = nft_pipapo_commit,
+ .abort = nft_pipapo_abort,
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 25a75591583e..eaab422aa56a 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -70,15 +70,9 @@
#define NFT_PIPAPO_ALIGN_HEADROOM \
(NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
-#define NFT_PIPAPO_LT_ASSIGN(field, x) \
- do { \
- (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \
- (field)->lt = (x); \
- } while (0)
#else
#define NFT_PIPAPO_ALIGN_HEADROOM 0
#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
-#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x))
#endif /* NFT_PIPAPO_ALIGN */
#define nft_pipapo_for_each_field(field, index, match) \
@@ -110,44 +104,50 @@ union nft_pipapo_map_bucket {
/**
* struct nft_pipapo_field - Lookup, mapping tables and related data for a field
- * @groups: Amount of bit groups
* @rules: Number of inserted rules
* @bsize: Size of each bucket in lookup table, in longs
+ * @rules_alloc: Number of allocated rules, always >= rules
+ * @groups: Amount of bit groups
* @bb: Number of bits grouped together in lookup table buckets
* @lt: Lookup table: 'groups' rows of buckets
- * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
* @mt: Mapping table: one bucket per rule
*/
struct nft_pipapo_field {
- int groups;
- unsigned long rules;
- size_t bsize;
- int bb;
-#ifdef NFT_PIPAPO_ALIGN
- unsigned long *lt_aligned;
-#endif
+ unsigned int rules;
+ unsigned int bsize;
+ unsigned int rules_alloc;
+ u8 groups;
+ u8 bb;
unsigned long *lt;
union nft_pipapo_map_bucket *mt;
};
/**
+ * struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock: PREEMPT_RT local spinlock
+ * @map_index: Current working bitmap index, toggled between field matches
+ * @__map: store partial matching results during lookup
+ */
+struct nft_pipapo_scratch {
+ local_lock_t bh_lock;
+ u8 map_index;
+ unsigned long __map[];
+};
+
+/**
* struct nft_pipapo_match - Data used for lookup and matching
- * @field_count Amount of fields in set
- * @scratch: Preallocated per-CPU maps for partial matching results
- * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes
+ * @field_count: Amount of fields in set
* @bsize_max: Maximum lookup table bucket size of all fields, in longs
- * @rcu Matching data is swapped on commits
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @rcu: Matching data is swapped on commits
* @f: Fields, with lookup and mapping tables
*/
struct nft_pipapo_match {
- int field_count;
-#ifdef NFT_PIPAPO_ALIGN
- unsigned long * __percpu *scratch_aligned;
-#endif
- unsigned long * __percpu *scratch;
- size_t bsize_max;
+ u8 field_count;
+ unsigned int bsize_max;
+ struct nft_pipapo_scratch * __percpu *scratch;
struct rcu_head rcu;
- struct nft_pipapo_field f[];
+ struct nft_pipapo_field f[] __counted_by(field_count);
};
/**
@@ -155,14 +155,12 @@ struct nft_pipapo_match {
* @match: Currently in-use matching data
* @clone: Copy where pending insertions and deletions are kept
* @width: Total bytes to be matched for one packet, including padding
- * @dirty: Working copy has pending insertions or deletions
* @last_gc: Timestamp of last garbage collection run, jiffies
*/
struct nft_pipapo {
struct nft_pipapo_match __rcu *match;
struct nft_pipapo_match *clone;
int width;
- bool dirty;
unsigned long last_gc;
};
@@ -170,14 +168,17 @@ struct nft_pipapo_elem;
/**
* struct nft_pipapo_elem - API-facing representation of single set element
+ * @priv: element placeholder
* @ext: nftables API extensions
*/
struct nft_pipapo_elem {
- struct nft_set_ext ext;
+ struct nft_elem_priv priv;
+ struct nft_set_ext ext;
};
-int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
- union nft_pipapo_map_bucket *mt, bool match_only);
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+ unsigned long *dst,
+ const union nft_pipapo_map_bucket *mt, bool match_only);
/**
* pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
@@ -185,7 +186,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
* @dst: Area to store result
* @data: Input data selecting table buckets
*/
-static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
+static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f,
unsigned long *dst,
const u8 *data)
{
@@ -213,7 +214,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
* @dst: Area to store result
* @data: Input data selecting table buckets
*/
-static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
+static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f,
unsigned long *dst,
const u8 *data)
{
@@ -277,4 +278,25 @@ static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
return size;
}
+/**
+ * pipapo_resmap_init() - Initialise result map before first use
+ * @m: Matching data, including mapping table
+ * @res_map: Result map
+ *
+ * Initialize all bits covered by the first field to one, so that after
+ * the first step, only the matching bits of the first bit group remain.
+ *
+ * If other fields have a large bitmap, set remainder of res_map to 0.
+ */
+static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map)
+{
+ const struct nft_pipapo_field *f = m->f;
+ int i;
+
+ for (i = 0; i < f->bsize; i++)
+ res_map[i] = ULONG_MAX;
+
+ for (i = f->bsize; i < m->bsize_max; i++)
+ res_map[i] = 0ul;
+}
#endif /* _NFT_SET_PIPAPO_H */
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index 52e0d026d30a..7ff90325c97f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -57,7 +57,7 @@
/* Jump to label if @reg is zero */
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
- asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
"je %l[" #label "]" : : : : label)
/* Store 256 bits from YMM register into memory. Contrary to bucket load
@@ -71,9 +71,6 @@
#define NFT_PIPAPO_AVX2_ZERO(reg) \
asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
-/* Current working bitmap index, toggled between field matches */
-static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
-
/**
* nft_pipapo_avx2_prepare() - Prepare before main algorithm body
*
@@ -215,8 +212,9 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
@@ -277,8 +275,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
@@ -353,8 +352,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
@@ -448,8 +448,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
@@ -537,8 +538,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
@@ -672,8 +674,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -729,8 +732,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -793,8 +797,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -868,8 +873,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -953,8 +959,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -987,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(3, 4, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
- NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 3, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
NFT_PIPAPO_AVX2_AND(2, 6, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
@@ -1029,6 +1037,7 @@ nothing:
/**
* nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
+ * @mdata: Matching data, including mapping table
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
@@ -1044,15 +1053,17 @@ nothing:
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
-static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata,
+ unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
unsigned long bsize = f->bsize;
int i, ret = -1, b;
if (first)
- memset(map, 0xff, bsize * sizeof(*map));
+ pipapo_resmap_init(mdata, map);
for (i = offset; i < bsize; i++) {
if (f->bb == 8)
@@ -1088,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
return false;
est->size = pipapo_estimate_size(desc);
@@ -1103,65 +1114,78 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
}
/**
- * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
- * @net: Network namespace
- * @set: nftables API set representation
- * @key: nftables API element representation containing key data
- * @ext: nftables API extension pointer, filled with matching reference
+ * pipapo_resmap_init_avx2() - Initialise result map before first use
+ * @m: Matching data, including mapping table
+ * @res_map: Result map
+ *
+ * Like pipapo_resmap_init() but do not set start map bits covered by the first field.
+ */
+static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map)
+{
+	const struct nft_pipapo_field *first_field = m->f;
+	int word = first_field->bsize;
+
+	/* This implementation does not need the leading words of the map
+	 * preset to all-ones; only the words from the first field's bsize up
+	 * to bsize_max, if there are any, have to start out as zero.
+	 */
+	while (word < m->bsize_max)
+		res_map[word++] = 0ul;
+}
+
+/**
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
*
* This implementation exploits the repetitive characteristic of the algorithm
* to provide a fast, vectorised version using the AVX2 SIMD instruction set.
*
- * Return: true on match, false otherwise.
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- unsigned long *res, *fill, *scratch;
- u8 genmask = nft_genmask_cur(net);
- const u8 *rp = (const u8 *)key;
- struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
+ struct nft_pipapo_scratch *scratch;
+ const struct nft_pipapo_field *f;
+ unsigned long *res, *fill, *map;
bool map_index;
- int i, ret = 0;
+ int i;
- if (unlikely(!irq_fpu_usable()))
- return nft_pipapo_lookup(net, set, key, ext);
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ return NULL;
- m = rcu_dereference(priv->match);
+ __local_lock_nested_bh(&scratch->bh_lock);
+ map_index = scratch->map_index;
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res = map + (map_index ? m->bsize_max : 0);
+ fill = map + (map_index ? 0 : m->bsize_max);
+
+ pipapo_resmap_init_avx2(m, res);
- /* This also protects access to all data related to scratch maps.
- *
- * Note that we don't need a valid MXCSR state for any of the
+ /* Note that we don't need a valid MXCSR state for any of the
* operations we use here, so pass 0 as mask and spare a LDMXCSR
* instruction.
*/
kernel_fpu_begin_mask(0);
- scratch = *raw_cpu_ptr(m->scratch_aligned);
- if (unlikely(!scratch)) {
- kernel_fpu_end();
- return false;
- }
- map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
-
- res = scratch + (map_index ? m->bsize_max : 0);
- fill = scratch + (map_index ? 0 : m->bsize_max);
-
- /* Starting map doesn't need to be set for this implementation */
-
nft_pipapo_avx2_prepare();
-next_match:
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1, first = !i;
+ int ret = 0;
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
- ret, rp, \
+ ret, data, \
first, last))
if (likely(f->bb == 8)) {
@@ -1176,8 +1200,8 @@ next_match:
} else if (f->groups == 16) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
} else {
- ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
- ret, rp,
+ ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
+ ret, data,
first, last);
}
} else {
@@ -1192,8 +1216,8 @@ next_match:
} else if (f->groups == 32) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
} else {
- ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
- ret, rp,
+ ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
+ ret, data,
first, last);
}
}
@@ -1201,28 +1225,78 @@ next_match:
#undef NFT_SET_PIPAPO_AVX2_LOOKUP
- if (ret < 0)
- goto out;
+next_match:
+ if (ret < 0) {
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+ }
if (last) {
- *ext = &f->mt[ret].e->ext;
- if (unlikely(nft_set_elem_expired(*ext) ||
- !nft_set_elem_active(*ext, genmask))) {
- ret = 0;
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[ret].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask))) {
+ ret = pipapo_refill(res, f->bsize, f->rules,
+ fill, f->mt, last);
goto next_match;
}
- goto out;
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return e;
}
+ map_index = !map_index;
swap(res, fill);
- rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
-out:
- if (i % 2)
- raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable or fall back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const u8 *rp = (const u8 *)key;
+ const struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+ if (unlikely(!irq_fpu_usable())) {
+ const struct nft_set_ext *ext;
+
+ ext = nft_pipapo_lookup(net, set, key);
+
+ local_bh_enable();
+ return ext;
+ }
+
+ m = rcu_dereference(priv->match);
+
+ e = pipapo_get_avx2(m, rp, 0, get_jiffies_64());
+ local_bh_enable();
- return ret >= 0;
+ return e ? &e->ext : NULL;
}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index dbb6aaca8a7a..c2999b63da3f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -5,8 +5,12 @@
#include <asm/fpu/xstate.h>
#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+struct nft_pipapo_match;
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp);
#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 19ea4d3c3553..ca594161b840 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -19,10 +19,11 @@ struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
seqcount_rwlock_t count;
- struct delayed_work gc_work;
+ unsigned long last_gc;
};
struct nft_rbtree_elem {
+ struct nft_elem_priv priv;
struct rb_node node;
struct nft_set_ext ext;
};
@@ -46,9 +47,14 @@ static int nft_rbtree_cmp(const struct nft_set *set,
set->klen);
}
-static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext,
- unsigned int seq)
+/* Tell whether @rbe has expired, as reported by nft_set_elem_expired() for
+ * the element's extension area.
+ */
+static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
+{
+	const struct nft_set_ext *ext = &rbe->ext;
+
+	return nft_set_elem_expired(ext);
+}
+
+static const struct nft_set_ext *
+__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, unsigned int seq)
{
struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
@@ -59,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
parent = rcu_dereference_raw(priv->root.rb_node);
while (parent != NULL) {
if (read_seqcount_retry(&priv->count, seq))
- return false;
+ return NULL;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
@@ -71,7 +77,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(interval))
continue;
- interval = rbe;
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_rbtree_elem_expired(rbe))
+ interval = rbe;
} else if (d > 0)
parent = rcu_dereference_raw(parent->rb_right);
else {
@@ -80,51 +88,47 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
continue;
}
- if (nft_set_elem_expired(&rbe->ext))
- return false;
+ if (nft_rbtree_elem_expired(rbe))
+ return NULL;
if (nft_rbtree_interval_end(rbe)) {
if (nft_set_is_anonymous(set))
- return false;
+ return NULL;
parent = rcu_dereference_raw(parent->rb_left);
interval = NULL;
continue;
}
- *ext = &rbe->ext;
- return true;
+ return &rbe->ext;
}
}
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_set_elem_active(&interval->ext, genmask) &&
- !nft_set_elem_expired(&interval->ext) &&
- nft_rbtree_interval_start(interval)) {
- *ext = &interval->ext;
- return true;
- }
+ nft_rbtree_interval_start(interval))
+ return &interval->ext;
- return false;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE
-bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
- bool ret;
+ const struct nft_set_ext *ext;
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
- if (ret || !read_seqcount_retry(&priv->count, seq))
- return ret;
+ ext = __nft_rbtree_lookup(net, set, key, seq);
+ if (ext || !read_seqcount_retry(&priv->count, seq))
+ return ext;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
+ ext = __nft_rbtree_lookup(net, set, key, seq);
read_unlock_bh(&priv->lock);
- return ret;
+ return ext;
}
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
@@ -191,8 +195,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
return false;
}
-static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rbtree_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
@@ -203,48 +208,82 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
if (ret || !read_seqcount_retry(&priv->count, seq))
- return rbe;
+ return &rbe->priv;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
- if (!ret)
- rbe = ERR_PTR(-ENOENT);
read_unlock_bh(&priv->lock);
- return rbe;
+ if (!ret)
+ return ERR_PTR(-ENOENT);
+
+ return &rbe->priv;
}
-static int nft_rbtree_gc_elem(const struct nft_set *__set,
- struct nft_rbtree *priv,
- struct nft_rbtree_elem *rbe)
+/* Deactivate the data of @rbe and unlink its node from the tree. Lockdep
+ * asserts that priv->lock is held for writing on entry.
+ */
+static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
+				      struct nft_rbtree *priv,
+				      struct nft_rbtree_elem *rbe)
+{
+	struct rb_root *root = &priv->root;
+	struct rb_node *victim = &rbe->node;
+
+	lockdep_assert_held_write(&priv->lock);
+
+	nft_setelem_data_deactivate(net, set, &rbe->priv);
+	rb_erase(victim, root);
+}
+
+static const struct nft_rbtree_elem *
+nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
{
struct nft_set *set = (struct nft_set *)__set;
struct rb_node *prev = rb_prev(&rbe->node);
+ struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev;
- struct nft_set_gc_batch *gcb;
+ struct nft_trans_gc *gc;
- gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
- if (!gcb)
- return -ENOMEM;
+ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
+ if (!gc)
+ return ERR_PTR(-ENOMEM);
- /* search for expired end interval coming before this element. */
- do {
+ /* search for end interval coming before this element.
+ * end intervals don't carry a timeout extension, they
+ * are coupled with the interval start element.
+ */
+ while (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
- if (nft_rbtree_interval_end(rbe_prev))
+ if (nft_rbtree_interval_end(rbe_prev) &&
+ nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY))
break;
prev = rb_prev(prev);
- } while (prev != NULL);
+ }
- rb_erase(&rbe_prev->node, &priv->root);
- rb_erase(&rbe->node, &priv->root);
- atomic_sub(2, &set->nelems);
+ rbe_prev = NULL;
+ if (prev) {
+ rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
- nft_set_gc_batch_add(gcb, rbe);
- nft_set_gc_batch_complete(gcb);
+ /* There is always room in this trans gc for this element,
+ * memory allocation never actually happens, hence, the warning
+ * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
+ * this is synchronous gc which never fails.
+ */
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return ERR_PTR(-ENOMEM);
- return 0;
+ nft_trans_gc_elem_add(gc, rbe_prev);
+ }
+
+ nft_rbtree_gc_elem_remove(net, set, priv, rbe);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return ERR_PTR(-ENOMEM);
+
+ nft_trans_gc_elem_add(gc, rbe);
+
+ nft_trans_gc_queue_sync_done(gc);
+
+ return rbe_prev;
}
static bool nft_rbtree_update_first(const struct nft_set *set,
@@ -265,13 +304,15 @@ static bool nft_rbtree_update_first(const struct nft_set *set,
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
- struct rb_node *node, *parent, **p, *first = NULL;
+ struct rb_node *node, *next, *parent, **p, *first = NULL;
struct nft_rbtree *priv = nft_set_priv(set);
+ u8 cur_genmask = nft_genmask_cur(net);
u8 genmask = nft_genmask_next(net);
- int d, err;
+ u64 tstamp = nft_net_tstamp(net);
+ int d;
/* Descend the tree to search for an existing element greater than the
* key value to insert that is greater than the new element. This is the
@@ -307,17 +348,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
* Values stored in the tree are in reversed order, starting from
* highest to lowest value.
*/
- for (node = first; node != NULL; node = rb_next(node)) {
+ for (node = first; node != NULL; node = next) {
+ next = rb_next(node);
+
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (!nft_set_elem_active(&rbe->ext, genmask))
continue;
- /* perform garbage collection to avoid bogus overlap reports. */
- if (nft_set_elem_expired(&rbe->ext)) {
- err = nft_rbtree_gc_elem(set, priv, rbe);
- if (err < 0)
- return err;
+ /* perform garbage collection to avoid bogus overlap reports
+ * but skip new elements in this transaction.
+ */
+ if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
+ nft_set_elem_active(&rbe->ext, cur_genmask)) {
+ const struct nft_rbtree_elem *removed_end;
+
+ removed_end = nft_rbtree_gc_elem(set, priv, rbe);
+ if (IS_ERR(removed_end))
+ return PTR_ERR(removed_end);
+
+ if (removed_end == rbe_le || removed_end == rbe_ge)
+ return -EAGAIN;
continue;
}
@@ -371,7 +422,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
*/
if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
- *ext = &rbe_ge->ext;
+ *elem_priv = &rbe_ge->priv;
return -EEXIST;
}
@@ -380,7 +431,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
*/
if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
- *ext = &rbe_le->ext;
+ *elem_priv = &rbe_le->priv;
return -EEXIST;
}
@@ -432,66 +483,74 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe = elem->priv;
int err;
+ do {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ cond_resched();
+
+ write_lock_bh(&priv->lock);
+ write_seqcount_begin(&priv->count);
+ err = __nft_rbtree_insert(net, set, rbe, elem_priv);
+ write_seqcount_end(&priv->count);
+ write_unlock_bh(&priv->lock);
+ } while (err == -EAGAIN);
+
+ return err;
+}
+
+static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
+{
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
- err = __nft_rbtree_insert(net, set, rbe, ext);
+ rb_erase(&rbe->node, &priv->root);
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
-
- return err;
}
static void nft_rbtree_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe = elem->priv;
- write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- rb_erase(&rbe->node, &priv->root);
- write_seqcount_end(&priv->count);
- write_unlock_bh(&priv->lock);
+ nft_rbtree_erase(priv, rbe);
}
static void nft_rbtree_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rbtree_elem *rbe = elem->priv;
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &rbe->ext);
- nft_set_elem_clear_busy(&rbe->ext);
+ nft_clear(net, &rbe->ext);
}
-static bool nft_rbtree_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_rbtree_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rbtree_elem *rbe = priv;
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
- if (!nft_set_elem_mark_busy(&rbe->ext) ||
- !nft_is_active(net, &rbe->ext)) {
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &rbe->ext);
}
-static void *nft_rbtree_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
- struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_next(net);
+ u64 tstamp = nft_net_tstamp(net);
int d;
while (parent != NULL) {
@@ -512,72 +571,92 @@ static void *nft_rbtree_deactivate(const struct net *net,
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
+ } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
+ break;
} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
parent = parent->rb_left;
continue;
}
- nft_rbtree_flush(net, set, rbe);
- return rbe;
+ nft_rbtree_flush(net, set, &rbe->priv);
+ return &rbe->priv;
}
}
return NULL;
}
-static void nft_rbtree_walk(const struct nft_ctx *ctx,
- struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
- struct nft_set_elem elem;
struct rb_node *node;
- read_lock_bh(&priv->lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&rbe->ext))
- goto cont;
- if (!nft_set_elem_active(&rbe->ext, iter->genmask))
- goto cont;
- elem.priv = rbe;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
- if (iter->err < 0) {
- read_unlock_bh(&priv->lock);
+ iter->err = iter->fn(ctx, set, iter, &rbe->priv);
+ if (iter->err < 0)
return;
- }
cont:
iter->count++;
}
- read_unlock_bh(&priv->lock);
}
-static void nft_rbtree_gc(struct work_struct *work)
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
- struct nft_set_gc_batch *gcb = NULL;
- struct nft_rbtree *priv;
- struct rb_node *node;
- struct nft_set *set;
- struct net *net;
- u8 genmask;
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+ nft_rbtree_do_walk(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ read_lock_bh(&priv->lock);
+ nft_rbtree_do_walk(ctx, set, iter);
+ read_unlock_bh(&priv->lock);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+/* Deactivate the data of @rbe, then take the element out of the tree through
+ * nft_rbtree_erase().
+ */
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+				 struct nft_rbtree *priv,
+				 struct nft_rbtree_elem *rbe)
+{
+	struct nft_elem_priv *elem_priv = &rbe->priv;
+
+	nft_setelem_data_deactivate(net, set, elem_priv);
+	nft_rbtree_erase(priv, rbe);
+}
+
+static void nft_rbtree_gc(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *rbe_end = NULL;
+ struct net *net = read_pnet(&set->net);
+ u64 tstamp = nft_net_tstamp(net);
+ struct rb_node *node, *next;
+ struct nft_trans_gc *gc;
- priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
net = read_pnet(&set->net);
- genmask = nft_genmask_cur(net);
- write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
- rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
- if (!nft_set_elem_active(&rbe->ext, genmask))
- continue;
+ for (node = rb_first(&priv->root); node ; node = next) {
+ next = rb_next(node);
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
/* elements are reversed in the rbtree for historical reasons,
* from highest to lowest value, that is why end element is
@@ -587,51 +666,37 @@ static void nft_rbtree_gc(struct work_struct *work)
rbe_end = rbe;
continue;
}
- if (!nft_set_elem_expired(&rbe->ext))
+ if (!__nft_set_elem_expired(&rbe->ext, tstamp))
continue;
- if (nft_set_elem_mark_busy(&rbe->ext)) {
- rbe_end = NULL;
- continue;
- }
-
- if (rbe_prev) {
- rb_erase(&rbe_prev->node, &priv->root);
- rbe_prev = NULL;
- }
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (!gcb)
- break;
-
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe);
- rbe_prev = rbe;
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ goto try_later;
+ /* end element needs to be removed first, it has
+ * no timeout extension.
+ */
if (rbe_end) {
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_end);
- rb_erase(&rbe_end->node, &priv->root);
+ nft_rbtree_gc_remove(net, set, priv, rbe_end);
+ nft_trans_gc_elem_add(gc, rbe_end);
rbe_end = NULL;
}
- node = rb_next(node);
- if (!node)
- break;
- }
- if (rbe_prev)
- rb_erase(&rbe_prev->node, &priv->root);
- write_seqcount_end(&priv->count);
- write_unlock_bh(&priv->lock);
- rbe = nft_set_catchall_gc(set);
- if (rbe) {
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb)
- nft_set_gc_batch_add(gcb, rbe);
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ goto try_later;
+
+ nft_rbtree_gc_remove(net, set, priv, rbe);
+ nft_trans_gc_elem_add(gc, rbe);
}
- nft_set_gc_batch_complete(gcb);
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
+try_later:
+
+ if (gc) {
+ gc = nft_trans_gc_catchall_sync(gc);
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -646,30 +711,26 @@ static int nft_rbtree_init(const struct nft_set *set,
{
struct nft_rbtree *priv = nft_set_priv(set);
+ BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);
+
rwlock_init(&priv->lock);
seqcount_rwlock_init(&priv->count, &priv->lock);
priv->root = RB_ROOT;
- INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
- if (set->flags & NFT_SET_TIMEOUT)
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
-
return 0;
}
-static void nft_rbtree_destroy(const struct nft_set *set)
+static void nft_rbtree_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
struct rb_node *node;
- cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- nft_set_elem_destroy(set, rbe, true);
+ nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
}
}
@@ -691,6 +752,61 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
+/* Run nft_rbtree_gc() on @set once at least nft_set_gc_interval() jiffies
+ * have elapsed since the timestamp stored in priv->last_gc.
+ */
+static void nft_rbtree_commit(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	unsigned long next_gc = priv->last_gc + nft_set_gc_interval(set);
+
+	/* Not due yet: leave the tree alone. */
+	if (!time_after_eq(jiffies, next_gc))
+		return;
+
+	nft_rbtree_gc(set);
+}
+
+/* Store the current jiffies value in the set's private last_gc field. */
+static void nft_rbtree_gc_init(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ priv->last_gc = jiffies;
+}
+
+/* The rbtree backend stores every range as two singleton elements, so the
+ * size used on the kernel side is twice the size passed in.
+ */
+static u32 nft_rbtree_ksize(u32 size)
+{
+	return size << 1;
+}
+
+/* Each range takes two elements in the rbtree; hide this detail from
+ * userspace by halving the size.
+ */
+static u32 nft_rbtree_usize(u32 size)
+{
+	return size ? size >> 1 : 0;
+}
+
+/**
+ * nft_rbtree_adjust_maxsize() - Detect the all-zero end element
+ * @set: nftables API set representation
+ *
+ * Return: 1 if the last node of the tree is an interval end element whose
+ * key consists only of zero bytes, 0 otherwise (including an empty tree).
+ */
+static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe;
+	struct rb_node *node;
+	const void *key;
+
+	node = rb_last(&priv->root);
+	if (!node)
+		return 0;
+
+	rbe = rb_entry(node, struct nft_rbtree_elem, node);
+	if (!nft_rbtree_interval_end(rbe))
+		return 0;
+
+	/* memchr_inv() reports the first byte that differs from 0, so any
+	 * non-zero byte in the key rules out the all-zero element. Looking
+	 * for the byte value 1 with memchr() would miss keys such as 0x02.
+	 */
+	key = nft_set_ext_key(&rbe->ext);
+	if (memchr_inv(key, 0, set->klen))
+		return 0;
+
+	/* this is the all-zero no-match element. */
+	return 1;
+}
+
const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
@@ -704,8 +820,13 @@ const struct nft_set_type nft_set_rbtree_type = {
.deactivate = nft_rbtree_deactivate,
.flush = nft_rbtree_flush,
.activate = nft_rbtree_activate,
+ .commit = nft_rbtree_commit,
+ .gc_init = nft_rbtree_gc_init,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
.get = nft_rbtree_get,
+ .ksize = nft_rbtree_ksize,
+ .usize = nft_rbtree_usize,
+ .adjust_maxsize = nft_rbtree_adjust_maxsize,
},
};
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 85f8df87efda..36affbb697c2 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -9,7 +9,8 @@
struct nft_socket {
enum nft_socket_keys key:8;
- u8 level;
+ u8 level; /* cgroupv2 level to extract */
+ u8 level_user; /* cgroupv2 level provided by userspace */
u8 len;
union {
u8 dreg;
@@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo
memcpy(dest, &cgid, sizeof(u64));
return true;
}
+
+/* process context only, uses current->nsproxy. */
+static noinline int nft_socket_cgroup_subtree_level(void)
+{
+ struct cgroup *cgrp = cgroup_get_from_path("/");
+ int level;
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ level = cgrp->level;
+
+ cgroup_put(cgrp);
+
+ if (level > 255)
+ return -ERANGE;
+
+ if (WARN_ON_ONCE(level < 0))
+ return -EINVAL;
+
+ return level;
+}
#endif
static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
@@ -107,16 +130,16 @@ static void nft_socket_eval(const struct nft_expr *expr,
break;
case NFT_SOCKET_MARK:
if (sk_fullsock(sk)) {
- *dest = sk->sk_mark;
+ *dest = READ_ONCE(sk->sk_mark);
} else {
regs->verdict.code = NFT_BREAK;
- return;
+ goto out_put_sk;
}
break;
case NFT_SOCKET_WILDCARD:
if (!sk_fullsock(sk)) {
regs->verdict.code = NFT_BREAK;
- return;
+ goto out_put_sk;
}
nft_socket_wildcard(pkt, regs, sk, dest);
break;
@@ -124,7 +147,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
case NFT_SOCKET_CGROUPV2:
if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) {
regs->verdict.code = NFT_BREAK;
- return;
+ goto out_put_sk;
}
break;
#endif
@@ -133,14 +156,15 @@ static void nft_socket_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+out_put_sk:
if (sk != skb->sk)
sock_gen_put(sk);
}
static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
- [NFTA_SOCKET_KEY] = { .type = NLA_U32 },
+ [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_SOCKET_DREG] = { .type = NLA_U32 },
- [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 },
+ [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_socket_init(const struct nft_ctx *ctx,
@@ -173,9 +197,10 @@ static int nft_socket_init(const struct nft_ctx *ctx,
case NFT_SOCKET_MARK:
len = sizeof(u32);
break;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_SOCK_CGROUP_DATA
case NFT_SOCKET_CGROUPV2: {
unsigned int level;
+ int err;
if (!tb[NFTA_SOCKET_LEVEL])
return -EINVAL;
@@ -184,6 +209,17 @@ static int nft_socket_init(const struct nft_ctx *ctx,
if (level > 255)
return -EOPNOTSUPP;
+ err = nft_socket_cgroup_subtree_level();
+ if (err < 0)
+ return err;
+
+ priv->level_user = level;
+
+ level += err;
+ /* Implies a giant cgroup tree */
+ if (level > 255)
+ return -EOPNOTSUPP;
+
priv->level = level;
len = sizeof(u64);
break;
@@ -208,7 +244,7 @@ static int nft_socket_dump(struct sk_buff *skb,
if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
return -1;
if (priv->key == NFT_SOCKET_CGROUPV2 &&
- nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
+ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user)))
return -1;
return 0;
}
@@ -239,9 +275,13 @@ static bool nft_socket_reduce(struct nft_regs_track *track,
}
static int nft_socket_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 13da882669a4..5d3e51825985 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
break;
#endif
case NFPROTO_INET:
- case NFPROTO_BRIDGE:
err = nf_synproxy_ipv4_init(snet, ctx->net);
if (err)
goto nf_ct_failure;
@@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
break;
#endif
case NFPROTO_INET:
- case NFPROTO_BRIDGE:
nf_synproxy_ipv4_fini(snet, ctx->net);
nf_synproxy_ipv6_fini(snet, ctx->net);
break;
@@ -250,9 +248,13 @@ static void nft_synproxy_eval(const struct nft_expr *expr,
}
static int nft_synproxy_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD));
}
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index ea83f661417e..50481280abd2 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -183,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
- [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 },
+ [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
[NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
};
@@ -254,14 +254,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
}
if (tb[NFTA_TPROXY_REG_ADDR]) {
- err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR],
+ err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR],
&priv->sreg_addr, alen);
if (err < 0)
return err;
}
if (tb[NFTA_TPROXY_REG_PORT]) {
- err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT],
+ err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT],
&priv->sreg_port, sizeof(u16));
if (err < 0)
return err;
@@ -313,9 +313,13 @@ static int nft_tproxy_dump(struct sk_buff *skb,
}
static int nft_tproxy_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
}
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index b059aa541798..a12486ae089d 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -66,9 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
- [NFTA_TUNNEL_KEY] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TUNNEL_DREG] = { .type = NLA_U32 },
- [NFTA_TUNNEL_MODE] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_tunnel_get_init(const struct nft_ctx *ctx,
@@ -174,8 +174,8 @@ struct nft_tunnel_opts {
struct erspan_metadata erspan;
u8 data[IP_TUNNEL_OPTS_MAX];
} u;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
u32 len;
- __be16 flags;
};
struct nft_tunnel_obj {
@@ -271,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
opts->len = sizeof(struct vxlan_metadata);
- opts->flags = TUNNEL_VXLAN_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags);
return 0;
}
@@ -325,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
opts->u.erspan.version = version;
opts->len = sizeof(struct erspan_metadata);
- opts->flags = TUNNEL_ERSPAN_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags);
return 0;
}
@@ -333,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
[NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
[NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
- [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
};
static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
struct nft_tunnel_opts *opts)
{
- struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len;
+ struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
int err, data_len;
@@ -366,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
opt->length = data_len / 4;
opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
- opts->flags = TUNNEL_GENEVE_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags);
return 0;
}
@@ -385,8 +388,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct nft_tunnel_opts *opts)
{
struct nlattr *nla;
- __be16 type = 0;
int err, rem;
+ u32 type = 0;
err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
nft_tunnel_opts_policy, NULL);
@@ -401,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
err = nft_tunnel_obj_vxlan_init(nla, opts);
if (err)
return err;
- type = TUNNEL_VXLAN_OPT;
+ type = IP_TUNNEL_VXLAN_OPT_BIT;
break;
case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
if (type)
@@ -409,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
err = nft_tunnel_obj_erspan_init(nla, opts);
if (err)
return err;
- type = TUNNEL_ERSPAN_OPT;
+ type = IP_TUNNEL_ERSPAN_OPT_BIT;
break;
case NFTA_TUNNEL_KEY_OPTS_GENEVE:
- if (type && type != TUNNEL_GENEVE_OPT)
+ if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
return -EINVAL;
err = nft_tunnel_obj_geneve_init(nla, opts);
if (err)
return err;
- type = TUNNEL_GENEVE_OPT;
+ type = IP_TUNNEL_GENEVE_OPT_BIT;
break;
default:
return -EOPNOTSUPP;
@@ -454,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
memset(&info, 0, sizeof(info));
info.mode = IP_TUNNEL_INFO_TX;
info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
- info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags);
+ __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags);
if (tb[NFTA_TUNNEL_KEY_IP]) {
err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
@@ -483,18 +488,16 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
- info.key.tun_flags &= ~TUNNEL_CSUM;
+ __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
- info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+ info.key.tun_flags);
if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
- info.key.tun_flags |= TUNNEL_SEQ;
+ __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags);
}
if (tb[NFTA_TUNNEL_KEY_TOS])
info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
- if (tb[NFTA_TUNNEL_KEY_TTL])
- info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
- else
- info.key.ttl = U8_MAX;
+ info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX);
if (tb[NFTA_TUNNEL_KEY_OPTS]) {
err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
@@ -503,13 +506,14 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return err;
}
- md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+ md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL,
+ GFP_KERNEL_ACCOUNT);
if (!md)
return -ENOMEM;
memcpy(&md->u.tun_info, &info, sizeof(info));
#ifdef CONFIG_DST_CACHE
- err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL);
+ err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT);
if (err < 0) {
metadata_dst_free(md);
return err;
@@ -583,7 +587,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
if (!nest)
return -1;
- if (opts->flags & TUNNEL_VXLAN_OPT) {
+ if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) {
inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN);
if (!inner)
goto failure;
@@ -591,7 +595,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
htonl(opts->u.vxlan.gbp)))
goto inner_failure;
nla_nest_end(skb, inner);
- } else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) {
inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN);
if (!inner)
goto failure;
@@ -613,15 +617,15 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
break;
}
nla_nest_end(skb, inner);
- } else if (opts->flags & TUNNEL_GENEVE_OPT) {
+ } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) {
struct geneve_opt *opt;
int offset = 0;
- inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
- if (!inner)
- goto failure;
while (opts->len > offset) {
- opt = (struct geneve_opt *)opts->u.data + offset;
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
+ opt = (struct geneve_opt *)(opts->u.data + offset);
if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
opt->opt_class) ||
nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
@@ -630,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
opt->length * 4, opt->opt_data))
goto inner_failure;
offset += sizeof(*opt) + opt->length * 4;
+ nla_nest_end(skb, inner);
}
- nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;
@@ -658,11 +662,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb,
{
u32 flags = 0;
- if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
- if (!(info->key.tun_flags & TUNNEL_CSUM))
+ if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
- if (info->key.tun_flags & TUNNEL_SEQ)
+ if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_SEQ_NUMBER;
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
@@ -713,6 +717,7 @@ static const struct nft_object_ops nft_tunnel_obj_ops = {
static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
.type = NFT_OBJECT_TUNNEL,
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_obj_ops,
.maxattr = NFTA_TUNNEL_KEY_MAX,
.policy = nft_tunnel_key_policy,
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index c88fd078a9ae..3210cfc966ab 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -16,9 +16,9 @@
#include <net/xfrm.h>
static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
- [NFTA_XFRM_KEY] = { .type = NLA_U32 },
+ [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_XFRM_DIR] = { .type = NLA_U8 },
- [NFTA_XFRM_SPNUM] = { .type = NLA_U32 },
+ [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_XFRM_DREG] = { .type = NLA_U32 },
};
@@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
return true;
}
- return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+ return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL ||
+ mode == XFRM_MODE_IPTFS;
}
static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
@@ -229,12 +230,16 @@ static int nft_xfrm_get_dump(struct sk_buff *skb,
return 0;
}
-static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
switch (priv->dir) {
case XFRM_POLICY_IN:
hooks = (1 << NF_INET_FORWARD) |
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 2182d361e273..008419db815a 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -179,39 +179,54 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
}
EXPORT_SYMBOL_GPL(nf_route);
-static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
+/* Only get and check the lengths; do not do any hop-by-hop processing. */
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
{
-#ifdef CONFIG_INET
- const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
- if (entry->state.hook == NF_INET_LOCAL_OUT) {
- const struct iphdr *iph = ip_hdr(skb);
-
- if (!(iph->tos == rt_info->tos &&
- skb->mark == rt_info->mark &&
- iph->daddr == rt_info->daddr &&
- iph->saddr == rt_info->saddr))
- return ip_route_me_harder(entry->state.net, entry->state.sk,
- skb, RTN_UNSPEC);
+ int len, off = sizeof(struct ipv6hdr);
+ unsigned char *nh;
+
+ if (!pskb_may_pull(skb, off + 8))
+ return -ENOMEM;
+ nh = (unsigned char *)(ipv6_hdr(skb) + 1);
+ len = (nh[1] + 1) << 3;
+
+ if (!pskb_may_pull(skb, off + len))
+ return -ENOMEM;
+ nh = skb_network_header(skb);
+
+ off += 2;
+ len -= 2;
+ while (len > 0) {
+ int optlen;
+
+ if (nh[off] == IPV6_TLV_PAD1) {
+ off++;
+ len--;
+ continue;
+ }
+ if (len < 2)
+ return -EBADMSG;
+ optlen = nh[off + 1] + 2;
+ if (optlen > len)
+ return -EBADMSG;
+
+ if (nh[off] == IPV6_TLV_JUMBO) {
+ u32 pkt_len;
+
+ if (nh[off + 1] != 4 || (off & 3) != 2)
+ return -EBADMSG;
+ pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+ if (pkt_len <= IPV6_MAXPLEN ||
+ ipv6_hdr(skb)->payload_len)
+ return -EBADMSG;
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+ return -EBADMSG;
+ *plen = pkt_len;
+ }
+ off += optlen;
+ len -= optlen;
}
-#endif
- return 0;
-}
-
-int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
-{
- const struct nf_ipv6_ops *v6ops;
- int ret = 0;
- switch (entry->state.pf) {
- case AF_INET:
- ret = nf_ip_reroute(skb, entry);
- break;
- case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- ret = v6ops->reroute(skb, entry);
- break;
- }
- return ret;
+ return len ? -EBADMSG : 0;
}
+EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 470282cf3fae..90b7630421c4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -768,7 +768,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
m->u.user.match_size = msize;
strscpy(name, match->name, sizeof(name));
module_put(match->me);
- strncpy(m->u.user.name, name, sizeof(m->u.user.name));
+ strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name));
*size += off;
*dstptr += msize;
@@ -1142,13 +1142,14 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
if (target->compat_from_user)
target->compat_from_user(t->data, ct->data);
else
- memcpy(t->data, ct->data, tsize - sizeof(*ct));
+ unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct),
+ /* UAPI 0-sized destination */);
tsize += off;
t->u.user.target_size = tsize;
strscpy(name, target->name, sizeof(name));
module_put(target->me);
- strncpy(t->u.user.name, name, sizeof(t->u.user.name));
+ strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name));
*size += off;
*dstptr += tsize;
@@ -1268,7 +1269,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
/* and once again: */
list_for_each_entry(t, &xt_net->tables[af], list)
- if (strcmp(t->name, name) == 0)
+ if (strcmp(t->name, name) == 0 && owner == t->me)
return t;
module_put(owner);
@@ -1316,12 +1317,13 @@ void xt_compat_unlock(u_int8_t af)
EXPORT_SYMBOL_GPL(xt_compat_unlock);
#endif
-DEFINE_PER_CPU(seqcount_t, xt_recseq);
-EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
-
struct static_key xt_tee_enabled __read_mostly;
EXPORT_SYMBOL_GPL(xt_tee_enabled);
+#ifdef CONFIG_NETFILTER_XTABLES_LEGACY
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
@@ -1513,6 +1515,7 @@ void *xt_unregister_table(struct xt_table *table)
return private;
}
EXPORT_SYMBOL_GPL(xt_unregister_table);
+#endif
#ifdef CONFIG_PROC_FS
static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
@@ -1896,6 +1899,7 @@ void xt_proto_fini(struct net *net, u_int8_t af)
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
+#ifdef CONFIG_NETFILTER_XTABLES_LEGACY
/**
* xt_percpu_counter_alloc - allocate x_tables rule counter
*
@@ -1950,6 +1954,7 @@ void xt_percpu_counter_free(struct xt_counters *counters)
free_percpu((void __percpu *)pcnt);
}
EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+#endif
static int __net_init xt_net_init(struct net *net)
{
@@ -1982,8 +1987,10 @@ static int __init xt_init(void)
unsigned int i;
int rv;
- for_each_possible_cpu(i) {
- seqcount_init(&per_cpu(xt_recseq, i));
+ if (IS_ENABLED(CONFIG_NETFILTER_XTABLES_LEGACY)) {
+ for_each_possible_cpu(i) {
+ seqcount_init(&per_cpu(xt_recseq, i));
+ }
}
xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL);
@@ -2014,4 +2021,3 @@ static void __exit xt_fini(void)
module_init(xt_init);
module_exit(xt_fini);
-
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index c8a639f56168..9d99f5a3d176 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -63,24 +63,37 @@ static int checksum_tg_check(const struct xt_tgchk_param *par)
return 0;
}
-static struct xt_target checksum_tg_reg __read_mostly = {
- .name = "CHECKSUM",
- .family = NFPROTO_UNSPEC,
- .target = checksum_tg,
- .targetsize = sizeof(struct xt_CHECKSUM_info),
- .table = "mangle",
- .checkentry = checksum_tg_check,
- .me = THIS_MODULE,
+static struct xt_target checksum_tg_reg[] __read_mostly = {
+ {
+ .name = "CHECKSUM",
+ .family = NFPROTO_IPV4,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CHECKSUM",
+ .family = NFPROTO_IPV6,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init checksum_tg_init(void)
{
- return xt_register_target(&checksum_tg_reg);
+ return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg));
}
static void __exit checksum_tg_exit(void)
{
- xt_unregister_target(&checksum_tg_reg);
+ xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg));
}
module_init(checksum_tg_init);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 0accac98dea7..0ae8d8a1216e 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -38,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
{
.name = "CLASSIFY",
.revision = 0,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
- (1 << NF_INET_POST_ROUTING),
+ (1 << NF_INET_POST_ROUTING),
.target = classify_tg,
.targetsize = sizeof(struct xt_classify_target_info),
.me = THIS_MODULE,
@@ -54,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_classify_target_info),
.me = THIS_MODULE,
},
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init classify_tg_init(void)
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 76acecf3e757..1494b3ee30e1 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -114,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_target connsecmark_tg_reg __read_mostly = {
- .name = "CONNSECMARK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = connsecmark_tg_check,
- .destroy = connsecmark_tg_destroy,
- .target = connsecmark_tg,
- .targetsize = sizeof(struct xt_connsecmark_target_info),
- .me = THIS_MODULE,
+static struct xt_target connsecmark_tg_reg[] __read_mostly = {
+ {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init connsecmark_tg_init(void)
{
- return xt_register_target(&connsecmark_tg_reg);
+ return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg));
}
static void __exit connsecmark_tg_exit(void)
{
- xt_unregister_target(&connsecmark_tg_reg);
+ xt_unregister_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg));
}
module_init(connsecmark_tg_init);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 2be2f7a7b60f..3ba94c34297c 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -313,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
xt_ct_tg_destroy(par, par->targinfo);
}
+static unsigned int
+notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->_nfct != 0)
+ return XT_CONTINUE;
+
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return XT_CONTINUE;
+}
+
static struct xt_target xt_ct_tg_reg[] __read_mostly = {
{
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.targetsize = sizeof(struct xt_ct_target_info),
.usersize = offsetof(struct xt_ct_target_info, ct),
.checkentry = xt_ct_tg_check_v0,
@@ -327,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
},
{
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
.usersize = offsetof(struct xt_ct_target_info, ct),
@@ -339,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
},
{
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
.usersize = offsetof(struct xt_ct_target_info, ct),
@@ -349,49 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.table = "raw",
.me = THIS_MODULE,
},
-};
-
-static unsigned int
-notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- /* Previously seen (loopback)? Ignore. */
- if (skb->_nfct != 0)
- return XT_CONTINUE;
-
- nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
-
- return XT_CONTINUE;
-}
-
-static struct xt_target notrack_tg_reg __read_mostly = {
- .name = "NOTRACK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = notrack_tg,
- .table = "raw",
- .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .targetsize = sizeof(struct xt_ct_target_info),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v0,
+ .destroy = xt_ct_tg_destroy_v0,
+ .target = xt_ct_target_v0,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v1,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .revision = 2,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v2,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init xt_ct_tg_init(void)
{
- int ret;
-
- ret = xt_register_target(&notrack_tg_reg);
- if (ret < 0)
- return ret;
-
- ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
- if (ret < 0) {
- xt_unregister_target(&notrack_tg_reg);
- return ret;
- }
- return 0;
+ return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
}
static void __exit xt_ct_tg_exit(void)
{
xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
- xt_unregister_target(&notrack_tg_reg);
}
module_init(xt_ct_tg_init);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 8d36303f3935..d73957592c9d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -100,21 +100,19 @@ static void idletimer_tg_work(struct work_struct *work)
static void idletimer_tg_expired(struct timer_list *t)
{
- struct idletimer_tg *timer = from_timer(timer, t, timer);
+ struct idletimer_tg *timer = timer_container_of(timer, t, timer);
pr_debug("timer %s expired\n", timer->attr.attr.name);
schedule_work(&timer->work);
}
-static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
- ktime_t now)
+static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now)
{
struct idletimer_tg *timer = alarm->data;
pr_debug("alarm %s expired\n", timer->attr.attr.name);
schedule_work(&timer->work);
- return ALARMTIMER_NORESTART;
}
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
@@ -170,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
INIT_WORK(&info->timer->work, idletimer_tg_work);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return 0;
@@ -231,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
} else {
timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
return 0;
@@ -256,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
info->label, info->timeout);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return XT_CONTINUE;
}
@@ -277,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
alarm_start_relative(&info->timer->alarm, tout);
} else {
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
return XT_CONTINUE;
@@ -322,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
if (info->timer) {
info->timer->refcnt++;
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
@@ -384,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
}
} else {
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
@@ -409,21 +407,23 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
mutex_lock(&list_mutex);
- if (--info->timer->refcnt == 0) {
- pr_debug("deleting timer %s\n", info->label);
-
- list_del(&info->timer->entry);
- timer_shutdown_sync(&info->timer->timer);
- cancel_work_sync(&info->timer->work);
- sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
- kfree(info->timer->attr.attr.name);
- kfree(info->timer);
- } else {
+ if (--info->timer->refcnt > 0) {
pr_debug("decreased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
+ mutex_unlock(&list_mutex);
+ return;
}
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
mutex_unlock(&list_mutex);
+
+ timer_shutdown_sync(&info->timer->timer);
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
}
static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
@@ -434,52 +434,75 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
mutex_lock(&list_mutex);
- if (--info->timer->refcnt == 0) {
- pr_debug("deleting timer %s\n", info->label);
-
- list_del(&info->timer->entry);
- if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
- alarm_cancel(&info->timer->alarm);
- } else {
- timer_shutdown_sync(&info->timer->timer);
- }
- cancel_work_sync(&info->timer->work);
- sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
- kfree(info->timer->attr.attr.name);
- kfree(info->timer);
- } else {
+ if (--info->timer->refcnt > 0) {
pr_debug("decreased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
+ mutex_unlock(&list_mutex);
+ return;
}
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
mutex_unlock(&list_mutex);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ alarm_cancel(&info->timer->alarm);
+ } else {
+ timer_shutdown_sync(&info->timer->timer);
+ }
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
}
static struct xt_target idletimer_tg[] __read_mostly = {
{
- .name = "IDLETIMER",
- .family = NFPROTO_UNSPEC,
- .target = idletimer_tg_target,
- .targetsize = sizeof(struct idletimer_tg_info),
- .usersize = offsetof(struct idletimer_tg_info, timer),
- .checkentry = idletimer_tg_checkentry,
- .destroy = idletimer_tg_destroy,
- .me = THIS_MODULE,
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV4,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .usersize = offsetof(struct idletimer_tg_info, timer),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
},
{
- .name = "IDLETIMER",
- .family = NFPROTO_UNSPEC,
- .revision = 1,
- .target = idletimer_tg_target_v1,
- .targetsize = sizeof(struct idletimer_tg_info_v1),
- .usersize = offsetof(struct idletimer_tg_info_v1, timer),
- .checkentry = idletimer_tg_checkentry_v1,
- .destroy = idletimer_tg_destroy_v1,
- .me = THIS_MODULE,
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
},
-
-
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV6,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .usersize = offsetof(struct idletimer_tg_info, timer),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct class *idletimer_tg_class;
@@ -490,7 +513,7 @@ static int __init idletimer_tg_init(void)
{
int err;
- idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+ idletimer_tg_class = class_create("xt_idletimer");
err = PTR_ERR(idletimer_tg_class);
if (IS_ERR(idletimer_tg_class)) {
pr_debug("couldn't register device class\n");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index 66b0f941d8fb..90dcf088071a 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -43,7 +43,6 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_led_info *ledinfo = par->targinfo;
struct xt_led_info_internal *ledinternal = ledinfo->internal_data;
- unsigned long led_delay = XT_LED_BLINK_DELAY;
/*
* If "always blink" is enabled, and there's still some time until the
@@ -52,7 +51,7 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
if ((ledinfo->delay > 0) && ledinfo->always_blink &&
timer_pending(&ledinternal->timer))
led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger,
- &led_delay, &led_delay, 1);
+ XT_LED_BLINK_DELAY, XT_LED_BLINK_DELAY, 1);
else
led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL);
@@ -73,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
static void led_timeout_callback(struct timer_list *t)
{
- struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t,
- timer);
+ struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal,
+ t,
+ timer);
led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
}
@@ -97,7 +97,9 @@ static int led_tg_check(const struct xt_tgchk_param *par)
struct xt_led_info_internal *ledinternal;
int err;
- if (ledinfo->id[0] == '\0')
+ /* Bail out if empty string or not a string at all. */
+ if (ledinfo->id[0] == '\0' ||
+ !memchr(ledinfo->id, '\0', sizeof(ledinfo->id)))
return -EINVAL;
mutex_lock(&xt_led_mutex);
@@ -176,26 +178,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par)
kfree(ledinternal);
}
-static struct xt_target led_tg_reg __read_mostly = {
- .name = "LED",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = led_tg,
- .targetsize = sizeof(struct xt_led_info),
- .usersize = offsetof(struct xt_led_info, internal_data),
- .checkentry = led_tg_check,
- .destroy = led_tg_destroy,
- .me = THIS_MODULE,
+static struct xt_target led_tg_reg[] __read_mostly = {
+ {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .usersize = offsetof(struct xt_led_info, internal_data),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .usersize = offsetof(struct xt_led_info, internal_data),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init led_tg_init(void)
{
- return xt_register_target(&led_tg_reg);
+ return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg));
}
static void __exit led_tg_exit(void)
{
- xt_unregister_target(&led_tg_reg);
+ xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg));
}
module_init(led_tg_init);
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index e660c3710a10..6dcf4bc7e30b 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -64,25 +64,39 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
}
-static struct xt_target nflog_tg_reg __read_mostly = {
- .name = "NFLOG",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = nflog_tg_check,
- .destroy = nflog_tg_destroy,
- .target = nflog_tg,
- .targetsize = sizeof(struct xt_nflog_info),
- .me = THIS_MODULE,
+static struct xt_target nflog_tg_reg[] __read_mostly = {
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init nflog_tg_init(void)
{
- return xt_register_target(&nflog_tg_reg);
+ return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
}
static void __exit nflog_tg_exit(void)
{
- xt_unregister_target(&nflog_tg_reg);
+ xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
}
module_init(nflog_tg_init);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 80f6624e2355..4f49cfc27831 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -179,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par)
xt_rateest_put(par->net, info->est);
}
-static struct xt_target xt_rateest_tg_reg __read_mostly = {
- .name = "RATEEST",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = xt_rateest_tg,
- .checkentry = xt_rateest_tg_checkentry,
- .destroy = xt_rateest_tg_destroy,
- .targetsize = sizeof(struct xt_rateest_target_info),
- .usersize = offsetof(struct xt_rateest_target_info, est),
- .me = THIS_MODULE,
+static struct xt_target xt_rateest_tg_reg[] __read_mostly = {
+ {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
+ .me = THIS_MODULE,
+ },
+#endif
};
static __net_init int xt_rateest_net_init(struct net *net)
@@ -214,12 +229,12 @@ static int __init xt_rateest_tg_init(void)
if (err)
return err;
- return xt_register_target(&xt_rateest_tg_reg);
+ return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg));
}
static void __exit xt_rateest_tg_fini(void)
{
- xt_unregister_target(&xt_rateest_tg_reg);
+ xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg));
unregister_pernet_subsys(&xt_rateest_net_ops);
}
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 353ca7801251..ff66b56a3f97 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-/* FIXME: Take multiple ranges --RR */
static int redirect_tg4_check(const struct xt_tgchk_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
static unsigned int
redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range2 range = {
+ .flags = mr->range[0].flags,
+ .min_proto = mr->range[0].min,
+ .max_proto = mr->range[0].max,
+ };
+
+ return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
}
static struct xt_target redirect_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 498a0bf6f044..5bc5ea505eb9 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -157,7 +157,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
{
.name = "SECMARK",
.revision = 0,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = secmark_tg_check_v0,
.destroy = secmark_tg_destroy,
.target = secmark_tg_v0,
@@ -167,7 +167,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
{
.name = "SECMARK",
.revision = 1,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = secmark_tg_check_v1,
.destroy = secmark_tg_destroy,
.target = secmark_tg_v1,
@@ -175,6 +175,29 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
.usersize = offsetof(struct xt_secmark_target_info_v1, secid),
.me = THIS_MODULE,
},
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = secmark_tg_check_v0,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v0,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SECMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = secmark_tg_check_v1,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v1,
+ .targetsize = sizeof(struct xt_secmark_target_info_v1),
+ .usersize = offsetof(struct xt_secmark_target_info_v1, secid),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init secmark_tg_init(void)
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 30e99464171b..93f064306901 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TCPOPTSTRIP",
.family = NFPROTO_IPV6,
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 5582dce98cae..a642ff09fc8e 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -29,25 +29,39 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static struct xt_target trace_tg_reg __read_mostly = {
- .name = "TRACE",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "raw",
- .target = trace_tg,
- .checkentry = trace_tg_check,
- .destroy = trace_tg_destroy,
- .me = THIS_MODULE,
+static struct xt_target trace_tg_reg[] __read_mostly = {
+ {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init trace_tg_init(void)
{
- return xt_register_target(&trace_tg_reg);
+ return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}
static void __exit trace_tg_exit(void)
{
- xt_unregister_target(&trace_tg_reg);
+ xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}
module_init(trace_tg_init);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index e9b2181e8c42..a77088943107 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -208,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
},
{
.name = "addrtype",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 1,
.match = addrtype_mt_v1,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
- }
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "addrtype",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = addrtype_mt_v1,
+ .checkentry = addrtype_mt_checkentry_v1,
+ .matchsize = sizeof(struct xt_addrtype_info_v1),
+ .me = THIS_MODULE
+ },
+#endif
};
static int __init addrtype_mt_init(void)
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index c0f5e9a4f3c6..c437fbd59ec1 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching");
MODULE_ALIAS("ipt_cgroup");
MODULE_ALIAS("ip6t_cgroup");
+#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n"
+
static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
{
struct xt_cgroup_info_v0 *info = par->matchinfo;
@@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
if (info->invert & ~1)
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
return 0;
}
@@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
static bool
cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
+#ifdef CONFIG_CGROUP_NET_CLASSID
const struct xt_cgroup_info_v0 *info = par->matchinfo;
struct sock *sk = skb->sk;
@@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
info->invert;
+#endif
+ return false;
}
static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
@@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
@@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index a047a545371e..908fd5f2c3c8 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_match xt_cluster_match __read_mostly = {
- .name = "cluster",
- .family = NFPROTO_UNSPEC,
- .match = xt_cluster_mt,
- .checkentry = xt_cluster_mt_checkentry,
- .matchsize = sizeof(struct xt_cluster_match_info),
- .destroy = xt_cluster_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match xt_cluster_match[] __read_mostly = {
+ {
+ .name = "cluster",
+ .family = NFPROTO_IPV4,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "cluster",
+ .family = NFPROTO_IPV6,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init xt_cluster_mt_init(void)
{
- return xt_register_match(&xt_cluster_match);
+ return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
}
static void __exit xt_cluster_mt_fini(void)
{
- xt_unregister_match(&xt_cluster_match);
+ xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
}
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 93cb018c3055..2aabdcea8707 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
ret = nf_ct_netns_get(par->net, par->family);
- if (ret < 0)
+ if (ret < 0) {
pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
par->family);
+ return ret;
+ }
/*
* This filter cannot function correctly unless connection tracking
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5d04ef80a61d..848287ab79cf 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = xt_net(par);
const struct xt_connlimit_info *info = par->matchinfo;
- struct nf_conntrack_tuple tuple;
- const struct nf_conntrack_tuple *tuple_ptr = &tuple;
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
@@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
u32 key[5];
ct = nf_ct_get(skb, &ctinfo);
- if (ct != NULL) {
- tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if (ct)
zone = nf_ct_zone(ct);
- } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- xt_family(par), net, &tuple)) {
- goto hotdrop;
- }
if (xt_family(par) == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
key[1] = zone->id;
}
- connections = nf_conncount_count(net, info->data, key, tuple_ptr,
- zone);
+ connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key);
if (connections == 0)
- /* kmalloc failed, drop it entirely */
+ /* kmalloc failed or tuple couldn't be found, drop it entirely */
goto hotdrop;
return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
@@ -86,6 +78,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
{
struct xt_connlimit_info *info = par->matchinfo;
unsigned int keylen;
+ int ret;
keylen = sizeof(u32);
if (par->family == NFPROTO_IPV6)
@@ -93,8 +86,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
else
keylen += sizeof(struct in_addr);
+ ret = nf_ct_netns_get(par->net, par->family);
+ if (ret < 0) {
+ pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+ }
+
/* init private data */
- info->data = nf_conncount_init(par->net, par->family, keylen);
+ info->data = nf_conncount_init(par->net, keylen);
+ if (IS_ERR(info->data))
+ nf_ct_netns_put(par->net, par->family);
return PTR_ERR_OR_ZERO(info->data);
}
@@ -103,29 +105,45 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_connlimit_info *info = par->matchinfo;
- nf_conncount_destroy(par->net, par->family, info->data);
+ nf_conncount_destroy(par->net, info->data);
+ nf_ct_netns_put(par->net, par->family);
}
-static struct xt_match connlimit_mt_reg __read_mostly = {
- .name = "connlimit",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .checkentry = connlimit_mt_check,
- .match = connlimit_mt,
- .matchsize = sizeof(struct xt_connlimit_info),
- .usersize = offsetof(struct xt_connlimit_info, data),
- .destroy = connlimit_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init connlimit_mt_init(void)
{
- return xt_register_match(&connlimit_mt_reg);
+ return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
static void __exit connlimit_mt_exit(void)
{
- xt_unregister_match(&connlimit_mt_reg);
+ xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index ad3c033db64e..4277084de2e7 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -151,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 1,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = connmark_tg_check,
.target = connmark_tg,
.targetsize = sizeof(struct xt_connmark_tginfo1),
@@ -161,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 2,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = connmark_tg_check,
.target = connmark_tg_v2,
.targetsize = sizeof(struct xt_connmark_tginfo2),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,
- }
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CONNMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg,
+ .targetsize = sizeof(struct xt_connmark_tginfo1),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CONNMARK",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg_v2,
+ .targetsize = sizeof(struct xt_connmark_tginfo2),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct xt_match connmark_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 0859b8f76764..3b507694e81e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -15,7 +15,6 @@
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/list.h>
@@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
if (size < 16)
size = 16;
}
- /* FIXME: don't use vmalloc() here or anywhere else -HW */
- hinfo = vmalloc(struct_size(hinfo, hash, size));
+ hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL);
if (hinfo == NULL)
return -ENOMEM;
*out_hinfo = hinfo;
@@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
/* copy match config into hashtable config */
ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
if (ret) {
- vfree(hinfo);
+ kvfree(hinfo);
return ret;
}
@@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
hinfo->rnd_initialized = false;
hinfo->name = kstrdup(name, GFP_KERNEL);
if (!hinfo->name) {
- vfree(hinfo);
+ kvfree(hinfo);
return -ENOMEM;
}
spin_lock_init(&hinfo->lock);
@@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
ops, hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
- vfree(hinfo);
+ kvfree(hinfo);
return -ENOMEM;
}
hinfo->net = net;
@@ -363,11 +361,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select
unsigned int i;
for (i = 0; i < ht->cfg.size; i++) {
+ struct hlist_head *head = &ht->hash[i];
struct dsthash_ent *dh;
struct hlist_node *n;
+ if (hlist_empty(head))
+ continue;
+
spin_lock_bh(&ht->lock);
- hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
+ hlist_for_each_entry_safe(dh, n, head, node) {
if (time_after_eq(jiffies, dh->expires) || select_all)
dsthash_free(ht, dh);
}
@@ -429,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
cancel_delayed_work_sync(&hinfo->gc_work);
htable_selective_cleanup(hinfo, true);
kfree(hinfo->name);
- vfree(hinfo);
+ kvfree(hinfo);
}
}
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index 1873da3a945a..ca730cedb5d4 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -21,7 +21,7 @@ static bool
length_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_length_info *info = par->matchinfo;
- u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
+ u32 pktlen = skb_ip_totlen(skb);
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
}
@@ -30,8 +30,7 @@ static bool
length_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_length_info *info = par->matchinfo;
- const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
- sizeof(struct ipv6hdr);
+ u32 pktlen = skb->len;
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
}
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 1ad74b5920b5..59b9d04400ca 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -39,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ((skb->mark & info->mask) == info->mark) ^ info->invert;
}
-static struct xt_target mark_tg_reg __read_mostly = {
- .name = "MARK",
- .revision = 2,
- .family = NFPROTO_UNSPEC,
- .target = mark_tg,
- .targetsize = sizeof(struct xt_mark_tginfo2),
- .me = THIS_MODULE,
+static struct xt_target mark_tg_reg[] __read_mostly = {
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_ARP,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct xt_match mark_mt_reg __read_mostly = {
@@ -61,12 +83,12 @@ static int __init mark_mt_init(void)
{
int ret;
- ret = xt_register_target(&mark_tg_reg);
+ ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
if (ret < 0)
return ret;
ret = xt_register_match(&mark_mt_reg);
if (ret < 0) {
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
return ret;
}
return 0;
@@ -75,7 +97,7 @@ static int __init mark_mt_init(void)
static void __exit mark_mt_exit(void)
{
xt_unregister_match(&mark_mt_reg);
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
}
module_init(mark_mt_init);
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 7c6bf1c16813..0ca1cdfc4095 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -38,8 +38,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
nfacct = nfnl_acct_find_get(par->net, info->name);
if (nfacct == NULL) {
- pr_info_ratelimited("accounting object `%s' does not exists\n",
- info->name);
+ pr_info_ratelimited("accounting object `%.*s' does not exist\n",
+ NFACCT_NAME_MAX, info->name);
return -ENOENT;
}
info->nfacct = nfacct;
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index e1990baf3a3b..dc9485854002 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
MODULE_DESCRIPTION("Passive OS fingerprint matching.");
MODULE_ALIAS("ipt_osf");
MODULE_ALIAS("ip6t_osf");
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index e85ce69924ae..50332888c8d2 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -76,18 +76,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
*/
return false;
- filp = sk->sk_socket->file;
- if (filp == NULL)
+ read_lock_bh(&sk->sk_callback_lock);
+ filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+ if (filp == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
+ }
if (info->match & XT_OWNER_UID) {
kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
uid_lte(filp->f_cred->fsuid, uid_max)) ^
- !(info->invert & XT_OWNER_UID))
+ !(info->invert & XT_OWNER_UID)) {
+ read_unlock_bh(&sk->sk_callback_lock);
return false;
+ }
}
if (info->match & XT_OWNER_GID) {
@@ -112,10 +117,13 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
}
- if (match ^ !(info->invert & XT_OWNER_GID))
+ if (match ^ !(info->invert & XT_OWNER_GID)) {
+ read_unlock_bh(&sk->sk_callback_lock);
return false;
+ }
}
+ read_unlock_bh(&sk->sk_callback_lock);
return true;
}
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index ec6ed6fda96c..343e65f377d4 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
(!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
return false;
- physdev = nf_bridge_get_physindev(skb);
+ physdev = nf_bridge_get_physindev(skb, xt_net(par));
indev = physdev ? physdev->name : NULL;
if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 7ddb9a78e3fc..588a5e6ad899 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -59,9 +59,9 @@ MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* fil
/* retained for backwards compatibility */
static unsigned int ip_pkt_list_tot __read_mostly;
module_param(ip_pkt_list_tot, uint, 0400);
-MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)");
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 65535)");
-#define XT_RECENT_MAX_NSTAMPS 256
+#define XT_RECENT_MAX_NSTAMPS 65536
struct recent_entry {
struct list_head list;
@@ -69,7 +69,7 @@ struct recent_entry {
union nf_inet_addr addr;
u_int16_t family;
u_int8_t ttl;
- u_int8_t index;
+ u_int16_t index;
u_int16_t nstamps;
unsigned long stamps[];
};
@@ -80,7 +80,7 @@ struct recent_table {
union nf_inet_addr mask;
unsigned int refcnt;
unsigned int entries;
- u8 nstamps_max_mask;
+ u_int16_t nstamps_max_mask;
struct list_head lru_list;
struct list_head iphash[];
};
@@ -561,7 +561,7 @@ recent_mt_proc_write(struct file *file, const char __user *input,
{
struct recent_table *t = pde_data(file_inode(file));
struct recent_entry *e;
- char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
+ char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")];
const char *c = buf;
union nf_inet_addr addr = {};
u_int16_t family;
diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
index 68ccbe50bb1e..600060ca940a 100644
--- a/net/netfilter/xt_repldata.h
+++ b/net/netfilter/xt_repldata.h
@@ -29,7 +29,7 @@
if (tbl == NULL) \
return NULL; \
term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
- strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
+ strscpy(tbl->repl.name, info->name); \
*term = (struct type##_error)typ2##_ERROR_INIT; \
tbl->repl.valid_hooks = hook_mask; \
tbl->repl.num_entries = nhooks + 1; \
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index e8961094a282..b46a6a512058 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -149,6 +149,8 @@ static int sctp_mt_check(const struct xt_mtchk_param *par)
{
const struct xt_sctp_info *info = par->matchinfo;
+ if (info->flag_count > ARRAY_SIZE(info->flag_info))
+ return -EINVAL;
if (info->flags & ~XT_SCTP_VALID_FLAGS)
return -EINVAL;
if (info->invflags & ~XT_SCTP_VALID_FLAGS)
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 7013f55f05d1..76e01f292aaf 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -77,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
- pskb->mark = sk->sk_mark;
+ pskb->mark = READ_ONCE(sk->sk_mark);
if (sk != skb->sk)
sock_gen_put(sk);
@@ -138,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
- pskb->mark = sk->sk_mark;
+ pskb->mark = READ_ONCE(sk->sk_mark);
if (sk != skb->sk)
sock_gen_put(sk);
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 11ec2abf0c72..e8991130a3de 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -4,6 +4,7 @@
#include <linux/module.h>
#include <net/ip.h>
#include <linux/ipv6.h>
+#include <linux/icmp.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/udp.h>
@@ -20,6 +21,8 @@ MODULE_ALIAS("ipt_udp");
MODULE_ALIAS("ipt_tcp");
MODULE_ALIAS("ip6t_udp");
MODULE_ALIAS("ip6t_tcp");
+MODULE_ALIAS("ipt_icmp");
+MODULE_ALIAS("ip6t_icmp6");
/* Returns 1 if the port is matched by the range, 0 otherwise */
static inline bool
@@ -161,6 +164,95 @@ static int udp_mt_check(const struct xt_mtchk_param *par)
return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0;
}
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code)
+{
+ return type == test_type && code >= min_code && code <= max_code;
+}
+
+static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code, bool invert)
+{
+ return (test_type == 0xFF ||
+ type_code_in_range(test_type, min_code, max_code, type, code))
+ ^ invert;
+}
+
+static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code, bool invert)
+{
+ return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert;
+}
+
+static bool
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmphdr *ic;
+ struct icmphdr _icmph;
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (!ic) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->type, ic->code,
+ !!(icmpinfo->invflags & IPT_ICMP_INV));
+}
+
+static bool
+icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmp6hdr *ic;
+ struct icmp6hdr _icmph;
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (!ic) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp6_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->icmp6_type, ic->icmp6_code,
+ !!(icmpinfo->invflags & IP6T_ICMP_INV));
+}
+
+static int icmp_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
+}
+
+static int icmp6_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0;
+}
+
static struct xt_match tcpudp_mt_reg[] __read_mostly = {
{
.name = "tcp",
@@ -216,6 +308,24 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
.proto = IPPROTO_UDPLITE,
.me = THIS_MODULE,
},
+ {
+ .name = "icmp",
+ .match = icmp_match,
+ .matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
+ .proto = IPPROTO_ICMP,
+ .family = NFPROTO_IPV4,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "icmp6",
+ .match = icmp6_match,
+ .matchsize = sizeof(struct ip6t_icmp),
+ .checkentry = icmp6_checkentry,
+ .proto = IPPROTO_ICMPV6,
+ .family = NFPROTO_IPV6,
+ .me = THIS_MODULE,
+ },
};
static int __init tcpudp_mt_init(void)
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index 177b40d08098..117d4615d668 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -96,11 +96,32 @@ static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret ^ data->invert;
}
+static int u32_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_u32 *data = par->matchinfo;
+ const struct xt_u32_test *ct;
+ unsigned int i;
+
+ if (data->ntests > ARRAY_SIZE(data->tests))
+ return -EINVAL;
+
+ for (i = 0; i < data->ntests; ++i) {
+ ct = &data->tests[i];
+
+ if (ct->nnums > ARRAY_SIZE(ct->location) ||
+ ct->nvalues > ARRAY_SIZE(ct->value))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct xt_match xt_u32_mt_reg __read_mostly = {
.name = "u32",
.revision = 0,
.family = NFPROTO_UNSPEC,
.match = u32_mt,
+ .checkentry = u32_mt_checkentry,
.matchsize = sizeof(struct xt_u32),
.me = THIS_MODULE,
};